diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 9863ff087ca86..c375fa5dc7516 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -499,6 +499,7 @@ clang:static analyzer:
   - clang/tools/scan-build/**
   - clang/utils/analyzer/**
   - clang/docs/analyzer/**
+  - clang/test/Analysis/**
 
 pgo:
   - llvm/lib/Transforms/Instrumentation/CGProfile.cpp
diff --git a/clang-tools-extra/clangd/CollectMacros.cpp b/clang-tools-extra/clangd/CollectMacros.cpp
index 96298ee3ea50a..1e7d765f0b6f1 100644
--- a/clang-tools-extra/clangd/CollectMacros.cpp
+++ b/clang-tools-extra/clangd/CollectMacros.cpp
@@ -18,10 +18,13 @@
 namespace clang {
 namespace clangd {
 
-Range MacroOccurrence::toRange(const SourceManager &SM) const {
+CharSourceRange MacroOccurrence::toSourceRange(const SourceManager &SM) const {
   auto MainFile = SM.getMainFileID();
-  return halfOpenToRange(
-      SM, syntax::FileRange(MainFile, StartOffset, EndOffset).toCharRange(SM));
+  return syntax::FileRange(MainFile, StartOffset, EndOffset).toCharRange(SM);
+}
+
+Range MacroOccurrence::toRange(const SourceManager &SM) const {
+  return halfOpenToRange(SM, toSourceRange(SM));
 }
 
 void CollectMainFileMacros::add(const Token &MacroNameTok, const MacroInfo *MI,
diff --git a/clang-tools-extra/clangd/CollectMacros.h b/clang-tools-extra/clangd/CollectMacros.h
index e7198641d8d53..20a3fc24d759c 100644
--- a/clang-tools-extra/clangd/CollectMacros.h
+++ b/clang-tools-extra/clangd/CollectMacros.h
@@ -31,6 +31,7 @@ struct MacroOccurrence {
   // True if the occurrence is used in a conditional directive, e.g. #ifdef MACRO
   bool InConditionalDirective;
 
+  CharSourceRange toSourceRange(const SourceManager &SM) const;
   Range toRange(const SourceManager &SM) const;
 };
diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp
index 1a23f6cca7756..8b9fffa3f64cd 100644
--- a/clang-tools-extra/clangd/XRefs.cpp
+++ b/clang-tools-extra/clangd/XRefs.cpp
@@ -372,6 +372,15 @@ void enhanceLocatedSymbolsFromIndex(llvm::MutableArrayRef<LocatedSymbol> Result,
   });
 }
 
+bool objcMethodIsTouched(const SourceManager &SM, const ObjCMethodDecl *OMD,
+                         SourceLocation Loc) {
+  unsigned NumSels = OMD->getNumSelectorLocs();
+  for (unsigned I = 0; I < NumSels; ++I)
+    if (SM.getSpellingLoc(OMD->getSelectorLoc(I)) == Loc)
+      return true;
+  return false;
+}
+
 // Decls are more complicated.
 // The AST contains at least a declaration, maybe a definition.
 // These are up-to-date, and so generally preferred over index results.
@@ -430,6 +439,26 @@ locateASTReferent(SourceLocation CurLoc, const syntax::Token *TouchedIdentifier,
         continue;
       }
     }
+    // Special case: - (void)^method {} should jump to overrides, but the decl
+    // shouldn't, only the definition. Note that an Objective-C method can
+    // override a parent class or protocol.
+    //
+    // FIXME: Support jumping from a protocol decl to overrides on go-to
+    // definition.
+    if (const auto *OMD = llvm::dyn_cast<ObjCMethodDecl>(D)) {
+      if (OMD->isThisDeclarationADefinition() && TouchedIdentifier &&
+          objcMethodIsTouched(SM, OMD, TouchedIdentifier->location())) {
+        llvm::SmallVector<const ObjCMethodDecl *, 4> Overrides;
+        OMD->getOverriddenMethods(Overrides);
+        if (!Overrides.empty()) {
+          for (const auto *Override : Overrides)
+            AddResultDecl(Override);
+          LocateASTReferentMetric.record(1, "objc-overriden-method");
+        }
+        AddResultDecl(OMD);
+        continue;
+      }
+    }
     // Special case: the cursor is on an alias, prefer other results.
    // This targets "using ns::^Foo", where the target is more interesting.
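For readers coming from the C++ side, here is a minimal sketch of the behavior the special case above enables (illustrative Objective-C, not from the patch; the XRefsTests hunks further down exercise the same pattern):

    @interface Base
    - (void)greet;
    @end
    @interface Impl : Base
    - (void)greet;
    @end
    @implementation Impl
    - (void)greet {   // go-to-definition here also offers Base's declaration;
    }                 // on the @interface declaration above it does not
    @end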
@@ -1283,6 +1312,12 @@ std::vector<LocatedSymbol> findImplementations(ParsedAST &AST, Position Pos,
     } else if (const auto *RD = dyn_cast<CXXRecordDecl>(ND)) {
       IDs.insert(getSymbolID(RD));
       QueryKind = RelationKind::BaseOf;
+    } else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(ND)) {
+      IDs.insert(getSymbolID(OMD));
+      QueryKind = RelationKind::OverriddenBy;
+    } else if (const auto *ID = dyn_cast<ObjCInterfaceDecl>(ND)) {
+      IDs.insert(getSymbolID(ID));
+      QueryKind = RelationKind::BaseOf;
     }
   }
   return findImplementors(std::move(IDs), QueryKind, Index, AST.tuPath());
@@ -1302,6 +1337,21 @@ void getOverriddenMethods(const CXXMethodDecl *CMD,
   }
 }
 
+// Recursively finds all the overridden methods of `OMD` in the complete type
+// hierarchy.
+void getOverriddenMethods(const ObjCMethodDecl *OMD,
+                          llvm::DenseSet<SymbolID> &OverriddenMethods) {
+  if (!OMD)
+    return;
+  llvm::SmallVector<const ObjCMethodDecl *, 4> Overrides;
+  OMD->getOverriddenMethods(Overrides);
+  for (const ObjCMethodDecl *Base : Overrides) {
+    if (auto ID = getSymbolID(Base))
+      OverriddenMethods.insert(ID);
+    getOverriddenMethods(Base, OverriddenMethods);
+  }
+}
+
 std::optional<std::string>
 stringifyContainerForMainFileRef(const Decl *Container) {
   // FIXME We might also want to display the signature here
@@ -1438,6 +1488,12 @@ ReferencesResult findReferences(ParsedAST &AST, Position Pos, uint32_t Limit,
           getOverriddenMethods(CMD, OverriddenMethods);
         }
       }
+      // Special case: Objective-C methods can override a parent class or
+      // protocol; we should be sure to report references to those.
+      if (const auto *OMD = llvm::dyn_cast<ObjCMethodDecl>(ND)) {
+        OverriddenBy.Subjects.insert(getSymbolID(OMD));
+        getOverriddenMethods(OMD, OverriddenMethods);
+      }
     }
   }
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp
index 1de7faf81746e..3f5633357073d 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.cpp
+++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp
@@ -713,7 +713,8 @@ void SymbolCollector::handleMacros(const MainFileMacros &MacroRefsToIndex) {
   // Add macro references.
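The change below builds on the new MacroOccurrence::toSourceRange() accessor introduced above; the intended layering is roughly this (a sketch using clangd's existing SourceCode.h helpers):

    CharSourceRange SR = MacroRef.toSourceRange(SM); // stored offsets -> char range
    Range R = halfOpenToRange(SM, SR);               // char range -> LSP range
    // The macro's spelling can be read straight from the char range, replacing
    // the two cantFail(sourceLocationInMainFile(...)) calls:
    llvm::StringRef Name = toSourceCode(SM, SR.getAsRange());

The loop that follows fills in the Ref and Symbol records using exactly this sequence.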
for (const auto &IDToRefs : MacroRefsToIndex.MacroRefs) { for (const auto &MacroRef : IDToRefs.second) { - const auto &Range = MacroRef.toRange(SM); + const auto &SR = MacroRef.toSourceRange(SM); + auto Range = halfOpenToRange(SM, SR); bool IsDefinition = MacroRef.IsDefinition; Ref R; R.Location.Start.setLine(Range.start.line); @@ -726,9 +727,7 @@ void SymbolCollector::handleMacros(const MainFileMacros &MacroRefsToIndex) { if (IsDefinition) { Symbol S; S.ID = IDToRefs.first; - auto StartLoc = cantFail(sourceLocationInMainFile(SM, Range.start)); - auto EndLoc = cantFail(sourceLocationInMainFile(SM, Range.end)); - S.Name = toSourceCode(SM, SourceRange(StartLoc, EndLoc)); + S.Name = toSourceCode(SM, SR.getAsRange()); S.SymInfo.Kind = index::SymbolKind::Macro; S.SymInfo.SubKind = index::SymbolSubKind::None; S.SymInfo.Properties = index::SymbolPropertySet(); diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp index 7a9703c744e93..1ce28c91a420c 100644 --- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp @@ -1335,6 +1335,42 @@ TEST_F(SymbolCollectorTest, OverrideRelationsMultipleInheritance) { OverriddenBy(CBar, DBar), OverriddenBy(CBaz, DBaz))); } +TEST_F(SymbolCollectorTest, ObjCOverrideRelationsSimpleInheritance) { + std::string Header = R"cpp( + @interface A + - (void)foo; + @end + @interface B : A + - (void)foo; // A::foo + - (void)bar; + @end + @interface C : B + - (void)bar; // B::bar + @end + @interface D : C + - (void)foo; // B::foo + - (void)bar; // C::bar + @end + )cpp"; + runSymbolCollector(Header, /*Main=*/"", + {"-xobjective-c++", "-Wno-objc-root-class"}); + const Symbol &AFoo = findSymbol(Symbols, "A::foo"); + const Symbol &BFoo = findSymbol(Symbols, "B::foo"); + const Symbol &DFoo = findSymbol(Symbols, "D::foo"); + + const Symbol &BBar = findSymbol(Symbols, "B::bar"); + const Symbol &CBar = findSymbol(Symbols, "C::bar"); + const Symbol &DBar = findSymbol(Symbols, "D::bar"); + + std::vector Result; + for (const Relation &R : Relations) + if (R.Predicate == RelationKind::OverriddenBy) + Result.push_back(R); + EXPECT_THAT(Result, UnorderedElementsAre( + OverriddenBy(AFoo, BFoo), OverriddenBy(BBar, CBar), + OverriddenBy(BFoo, DFoo), OverriddenBy(CBar, DBar))); +} + TEST_F(SymbolCollectorTest, CountReferences) { const std::string Header = R"( class W; diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 7d824d659ad2c..475b56b1dc230 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -411,6 +411,85 @@ TEST(LocateSymbol, FindOverrides) { sym("foo", Code.range("2"), std::nullopt))); } +TEST(LocateSymbol, FindOverridesFromDefObjC) { + auto Code = Annotations(R"objc( + @protocol Fooey + - (void)foo; + @end + @interface Base + - (void)foo; + @end + @interface Foo : Base + - (void)$1[[foo]]; + @end + + @interface Bar : Foo + - (void)$2[[foo]]; + @end + @implementation Bar + - (void)$3[[fo^o]] {} + @end + )objc"); + TestTU TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + EXPECT_THAT( + locateSymbolAt(AST, Code.point(), TU.index().get()), + UnorderedElementsAre(sym("foo", Code.range("1"), std::nullopt), + sym("foo", Code.range("2"), Code.range("3")))); +} + +TEST(LocateSymbol, NoOverridesFromDeclObjC) { + auto Code = Annotations(R"objc( + 
@protocol Fooey + - (void)foo; + @end + @interface Base + - (void)foo; + @end + @interface Foo : Base + - (void)foo; + @end + + @interface Bar : Foo + - (void)$2[[fo^o]]; + @end + @implementation Bar + - (void)$3[[foo]] {} + @end + )objc"); + TestTU TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + EXPECT_THAT( + locateSymbolAt(AST, Code.point(), TU.index().get()), + UnorderedElementsAre(sym("foo", Code.range("2"), Code.range("3")))); +} + +TEST(LocateSymbol, ObjCNoOverridesOnUsage) { + auto Code = Annotations(R"objc( + @interface Foo + - (void)foo; + @end + + @interface Bar : Foo + - (void)$1[[foo]]; + @end + @implementation Bar + - (void)$2[[foo]] {} + @end + void doSomething(Bar *bar) { + [bar fo^o]; + } + )objc"); + TestTU TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + EXPECT_THAT( + locateSymbolAt(AST, Code.point(), TU.index().get()), + UnorderedElementsAre(sym("foo", Code.range("1"), Code.range("2")))); +} + TEST(LocateSymbol, WithIndexPreferredLocation) { Annotations SymbolHeader(R"cpp( class $p[[Proto]] {}; @@ -1834,6 +1913,41 @@ TEST(FindImplementations, Inheritance) { } } +TEST(FindImplementations, InheritanceObjC) { + llvm::StringRef Test = R"objc( + @interface $base^Base + - (void)fo$foo^o; + @end + @protocol Protocol + - (void)$protocol^protocol; + @end + @interface $ChildDecl[[Child]] : Base + - (void)concrete; + - (void)$fooDecl[[foo]]; + @end + @implementation $ChildDef[[Child]] + - (void)concrete {} + - (void)$fooDef[[foo]] {} + - (void)$protocolDef[[protocol]] {} + @end + )objc"; + + Annotations Code(Test); + auto TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + auto Index = TU.index(); + EXPECT_THAT(findImplementations(AST, Code.point("base"), Index.get()), + UnorderedElementsAre(sym("Child", Code.range("ChildDecl"), + Code.range("ChildDef")))); + EXPECT_THAT(findImplementations(AST, Code.point("foo"), Index.get()), + UnorderedElementsAre( + sym("foo", Code.range("fooDecl"), Code.range("fooDef")))); + EXPECT_THAT(findImplementations(AST, Code.point("protocol"), Index.get()), + UnorderedElementsAre(sym("protocol", Code.range("protocolDef"), + Code.range("protocolDef")))); +} + TEST(FindImplementations, CaptureDefinition) { llvm::StringRef Test = R"cpp( struct Base { @@ -1963,6 +2077,7 @@ void checkFindRefs(llvm::StringRef Test, bool UseIndex = false) { Annotations T(Test); auto TU = TestTU::withCode(T.code()); TU.ExtraArgs.push_back("-std=c++20"); + TU.ExtraArgs.push_back("-xobjective-c++"); auto AST = TU.build(); std::vector> ExpectedLocations; @@ -2260,6 +2375,25 @@ TEST(FindReferences, IncludeOverrides) { checkFindRefs(Test, /*UseIndex=*/true); } +TEST(FindReferences, IncludeOverridesObjC) { + llvm::StringRef Test = + R"objc( + @interface Base + - (void)$decl(Base)[[f^unc]]; + @end + @interface Derived : Base + - (void)$overridedecl(Derived::func)[[func]]; + @end + @implementation Derived + - (void)$overridedef[[func]] {} + @end + void test(Derived *derived, Base *base) { + [derived func]; // No references to the overrides. 
+        [base $(test)[[func]]];
+      })objc";
+  checkFindRefs(Test, /*UseIndex=*/true);
+}
+
 TEST(FindReferences, RefsToBaseMethod) {
   llvm::StringRef Test =
       R"cpp(
@@ -2284,6 +2418,27 @@ TEST(FindReferences, RefsToBaseMethod) {
   checkFindRefs(Test, /*UseIndex=*/true);
 }
 
+TEST(FindReferences, RefsToBaseMethodObjC) {
+  llvm::StringRef Test =
+      R"objc(
+        @interface BaseBase
+        - (void)$(BaseBase)[[func]];
+        @end
+        @interface Base : BaseBase
+        - (void)$(Base)[[func]];
+        @end
+        @interface Derived : Base
+        - (void)$decl(Derived)[[fu^nc]];
+        @end
+        void test(BaseBase *bb, Base *b, Derived *d) {
+          // refs to overridden methods in the complete type hierarchy are reported.
+          [bb $(test)[[func]]];
+          [b $(test)[[func]]];
+          [d $(test)[[fu^nc]]];
+        })objc";
+  checkFindRefs(Test, /*UseIndex=*/true);
+}
+
 TEST(FindReferences, MainFileReferencesOnly) {
   llvm::StringRef Test = R"cpp(
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a91c764860ccd..5780f5d61d579 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -170,6 +170,7 @@ Bug Fixes to C++ Support
 - Clang is now better at keeping track of friend function template instance contexts. (#GH55509)
 - The initialization kind of elements of structured bindings
   direct-list-initialized from an array is corrected to direct-initialization.
+- Clang no longer crashes when a coroutine is declared ``[[noreturn]]``. (#GH127327)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index ff4f236c1fa88..0f98d237dcbcd 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -4579,25 +4579,97 @@ class ShuffleVectorExpr : public Expr {
 /// ConvertVectorExpr - Clang builtin function __builtin_convertvector
 /// This AST node provides support for converting a vector type to another
 /// vector type of the same arity.
-class ConvertVectorExpr : public Expr {
+class ConvertVectorExpr final
+    : public Expr,
+      private llvm::TrailingObjects<ConvertVectorExpr, FPOptionsOverride> {
 private:
   Stmt *SrcExpr;
   TypeSourceInfo *TInfo;
   SourceLocation BuiltinLoc, RParenLoc;
 
+  friend TrailingObjects;
   friend class ASTReader;
   friend class ASTStmtReader;
-  explicit ConvertVectorExpr(EmptyShell Empty) : Expr(ConvertVectorExprClass, Empty) {}
+  explicit ConvertVectorExpr(bool HasFPFeatures, EmptyShell Empty)
+      : Expr(ConvertVectorExprClass, Empty) {
+    ConvertVectorExprBits.HasFPFeatures = HasFPFeatures;
+  }
 
-public:
   ConvertVectorExpr(Expr *SrcExpr, TypeSourceInfo *TI, QualType DstType,
                     ExprValueKind VK, ExprObjectKind OK,
-                    SourceLocation BuiltinLoc, SourceLocation RParenLoc)
+                    SourceLocation BuiltinLoc, SourceLocation RParenLoc,
+                    FPOptionsOverride FPFeatures)
       : Expr(ConvertVectorExprClass, DstType, VK, OK), SrcExpr(SrcExpr),
         TInfo(TI), BuiltinLoc(BuiltinLoc), RParenLoc(RParenLoc) {
+    ConvertVectorExprBits.HasFPFeatures = FPFeatures.requiresTrailingStorage();
+    if (hasStoredFPFeatures())
+      setStoredFPFeatures(FPFeatures);
     setDependence(computeDependence(this));
   }
 
+  size_t numTrailingObjects(OverloadToken<FPOptionsOverride>) const {
+    return ConvertVectorExprBits.HasFPFeatures ? 1 : 0;
+  }
+
+  FPOptionsOverride &getTrailingFPFeatures() {
+    assert(ConvertVectorExprBits.HasFPFeatures);
+    return *getTrailingObjects<FPOptionsOverride>();
+  }
+
+  const FPOptionsOverride &getTrailingFPFeatures() const {
+    assert(ConvertVectorExprBits.HasFPFeatures);
+    return *getTrailingObjects<FPOptionsOverride>();
+  }
+
+public:
+  static ConvertVectorExpr *CreateEmpty(const ASTContext &C,
+                                        bool hasFPFeatures);
+
+  static ConvertVectorExpr *Create(const ASTContext &C, Expr *SrcExpr,
+                                   TypeSourceInfo *TI, QualType DstType,
+                                   ExprValueKind VK, ExprObjectKind OK,
+                                   SourceLocation BuiltinLoc,
+                                   SourceLocation RParenLoc,
+                                   FPOptionsOverride FPFeatures);
+
+  /// Get the FP contractibility status of this operator. Only meaningful for
+  /// operations on floating point types.
+  bool isFPContractableWithinStatement(const LangOptions &LO) const {
+    return getFPFeaturesInEffect(LO).allowFPContractWithinStatement();
+  }
+
+  /// Is FPFeatures in trailing storage?
+  bool hasStoredFPFeatures() const {
+    return ConvertVectorExprBits.HasFPFeatures;
+  }
+
+  /// Get FPFeatures from trailing storage.
+  FPOptionsOverride getStoredFPFeatures() const {
+    return getTrailingFPFeatures();
+  }
+
+  /// Get the stored FPOptionsOverride or default if not stored.
+  FPOptionsOverride getStoredFPFeaturesOrDefault() const {
+    return hasStoredFPFeatures() ? getStoredFPFeatures() : FPOptionsOverride();
+  }
+
+  /// Set FPFeatures in trailing storage, used by Serialization & ASTImporter.
+  void setStoredFPFeatures(FPOptionsOverride F) { getTrailingFPFeatures() = F; }
+
+  /// Get the FP features status of this operator. Only meaningful for
+  /// operations on floating point types.
+  FPOptions getFPFeaturesInEffect(const LangOptions &LO) const {
+    if (ConvertVectorExprBits.HasFPFeatures)
+      return getStoredFPFeatures().applyOverrides(LO);
+    return FPOptions::defaultWithoutTrailingStorage(LO);
+  }
+
+  FPOptionsOverride getFPOptionsOverride() const {
+    if (ConvertVectorExprBits.HasFPFeatures)
+      return getStoredFPFeatures();
+    return FPOptionsOverride();
+  }
+
   /// getSrcExpr - Return the Expr to be converted.
   Expr *getSrcExpr() const { return cast<Expr>(SrcExpr); }
diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
index 405c6166adb15..604ac51d478cf 100644
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@@ -1215,6 +1215,20 @@ class alignas(void *) Stmt {
     SourceLocation Loc;
   };
 
+  class ConvertVectorExprBitfields {
+    friend class ConvertVectorExpr;
+
+    LLVM_PREFERRED_TYPE(ExprBitfields)
+    unsigned : NumExprBits;
+
+    /// This is only meaningful for operations on floating point
+    /// types when additional values need to be in trailing storage.
+    /// It is 0 otherwise.
+    LLVM_PREFERRED_TYPE(bool)
+    unsigned HasFPFeatures : 1;
+  };
+
   union {
     // Same order as in StmtNodes.td.
// Statements @@ -1293,6 +1307,7 @@ class alignas(void *) Stmt { // Clang Extensions OpaqueValueExprBitfields OpaqueValueExprBits; + ConvertVectorExprBitfields ConvertVectorExprBits; }; public: diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index 4b5ad2b5fa74c..81844db2c77fa 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -425,6 +425,7 @@ class TextNodeDumper void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S); void VisitEmbedExpr(const EmbedExpr *S); void VisitAtomicExpr(const AtomicExpr *AE); + void VisitConvertVectorExpr(const ConvertVectorExpr *S); }; } // namespace clang diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 0e5df338dd2e5..0e8b0189540bd 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4771,6 +4771,12 @@ def HLSLAll : LangBuiltin<"HLSL_LANG"> { let Prototype = "bool(...)"; } +def HLSLAnd : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_and"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLAny : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_any"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index f33ba46233a7a..793cab1f4e84a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -106,8 +106,6 @@ enum class OffloadArch { GFX90a, GFX90c, GFX9_4_GENERIC, - GFX940, - GFX941, GFX942, GFX950, GFX10_1_GENERIC, diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index ee1ad214d81df..feef50812eca9 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -713,13 +713,14 @@ def err_thread_non_global : Error< def err_thread_unsupported : Error< "thread-local storage is not supported for the current target">; -// FIXME: Combine fallout warnings to just one warning. 
-def warn_maybe_falloff_nonvoid_function : Warning< - "non-void function does not return a value in all control paths">, - InGroup; -def warn_falloff_nonvoid_function : Warning< - "non-void function does not return a value">, +def warn_falloff_nonvoid : Warning< + "non-void " + "%enum_select{%Function{function}|%Block{block}|%Lambda{lambda}|%Coroutine{coroutine}}0" + " does not return a value%select{| in all control paths}1">, InGroup; +def err_falloff_nonvoid : Error< + "non-void %select{function|block|lambda|coroutine}0 " + "does not return a value%select{| in all control paths}1">; def warn_const_attr_with_pure_attr : Warning< "'const' attribute imposes more restrictions; 'pure' attribute ignored">, InGroup; @@ -727,16 +728,6 @@ def warn_pure_function_returns_void : Warning< "'%select{pure|const}0' attribute on function returning 'void'; attribute ignored">, InGroup; -def err_maybe_falloff_nonvoid_block : Error< - "non-void block does not return a value in all control paths">; -def err_falloff_nonvoid_block : Error< - "non-void block does not return a value">; -def warn_maybe_falloff_nonvoid_coroutine : Warning< - "non-void coroutine does not return a value in all control paths">, - InGroup; -def warn_falloff_nonvoid_coroutine : Warning< - "non-void coroutine does not return a value">, - InGroup; def warn_suggest_noreturn_function : Warning< "%select{function|method}0 %1 could be declared with attribute 'noreturn'">, InGroup, DefaultIgnore; @@ -8406,14 +8397,6 @@ let CategoryName = "Lambda Issue" in { "lambda expression in default argument cannot capture any entity">; def err_lambda_incomplete_result : Error< "incomplete result type %0 in lambda expression">; - def err_noreturn_lambda_has_return_expr : Error< - "lambda declared 'noreturn' should not return">; - def warn_maybe_falloff_nonvoid_lambda : Warning< - "non-void lambda does not return a value in all control paths">, - InGroup; - def warn_falloff_nonvoid_lambda : Warning< - "non-void lambda does not return a value">, - InGroup; def err_access_lambda_capture : Error< // The ERRORs represent other special members that aren't constructors, in // hopes that someone will bother noticing and reporting if they appear @@ -10603,14 +10586,16 @@ def err_ctor_dtor_returns_void : Error< def warn_noreturn_function_has_return_expr : Warning< "function %0 declared 'noreturn' should not return">, InGroup; -def warn_falloff_noreturn_function : Warning< - "function declared 'noreturn' should not return">, +def warn_noreturn_has_return_expr : Warning< + "%select{function|block|lambda|coroutine}0 " + "declared 'noreturn' should not return">, InGroup; +def err_noreturn_has_return_expr : Error< + "%select{function|block|lambda|coroutine}0 " + "declared 'noreturn' should not return">; def warn_noreturn_coroutine : Warning< "coroutine %0 cannot be declared 'noreturn' as it always returns a coroutine handle">, InGroup; -def err_noreturn_block_has_return_expr : Error< - "block declared 'noreturn' should not return">; def err_carries_dependency_missing_on_first_decl : Error< "%select{function|parameter}0 declared '[[carries_dependency]]' " "after its first declaration">; diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td index 305a06427ed0e..73759cfa9c3c9 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td @@ -28,6 +28,8 @@ def CIR_Dialect : Dialect { let useDefaultTypePrinterParser = 0; let extraClassDeclaration = [{ + static 
llvm::StringRef getTripleAttrName() { return "cir.triple"; } + void registerAttributes(); void registerTypes(); diff --git a/clang/include/clang/CIR/FrontendAction/CIRGenAction.h b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h index 5f9110bc83b89..99495f4718c5f 100644 --- a/clang/include/clang/CIR/FrontendAction/CIRGenAction.h +++ b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h @@ -25,8 +25,11 @@ class CIRGenConsumer; class CIRGenAction : public clang::ASTFrontendAction { public: enum class OutputType { + EmitAssembly, EmitCIR, EmitLLVM, + EmitBC, + EmitObj, }; private: @@ -63,6 +66,27 @@ class EmitLLVMAction : public CIRGenAction { EmitLLVMAction(mlir::MLIRContext *MLIRCtx = nullptr); }; +class EmitBCAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitBCAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + +class EmitAssemblyAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitAssemblyAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + +class EmitObjAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitObjAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + } // namespace cir #endif diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index fb8e445cb4b72..8489619832a47 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -135,7 +135,16 @@ module Clang_Frontend { module Clang_FrontendTool { requires cplusplus umbrella "clang/FrontendTool" module * { export * } } module Clang_Index { requires cplusplus umbrella "clang/Index" module * { export * } } -module Clang_Lex { requires cplusplus umbrella "clang/Lex" module * { export * } } + +module Clang_Lex { + requires cplusplus + umbrella "clang/Lex" + + textual header "clang/Lex/HLSLRootSignatureTokenKinds.def" + + module * { export * } +} + module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } } module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } } module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } } diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index c27ebbf838ad1..43da76e14d0a3 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -7386,9 +7386,10 @@ ExpectedStmt ASTNodeImporter::VisitConvertVectorExpr(ConvertVectorExpr *E) { if (Err) return std::move(Err); - return new (Importer.getToContext()) - ConvertVectorExpr(ToSrcExpr, ToTSI, ToType, E->getValueKind(), - E->getObjectKind(), ToBuiltinLoc, ToRParenLoc); + return ConvertVectorExpr::Create( + Importer.getToContext(), ToSrcExpr, ToTSI, ToType, E->getValueKind(), + E->getObjectKind(), ToBuiltinLoc, ToRParenLoc, + E->getStoredFPFeaturesOrDefault()); } ExpectedStmt ASTNodeImporter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 503c58a67adeb..a35aa9471a73d 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -3397,7 +3397,8 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { CtorFunc = getFunction(CE->getConstructor()); if (!CtorFunc) return false; - } + } else if (!DynamicInit) + DynamicInit = Init; LabelTy EndLabel = this->getLabel(); LabelTy StartLabel = this->getLabel(); diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index ca74046038072..fa113aa0bb157 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ 
b/clang/lib/AST/ByteCode/Interp.h
@@ -1132,13 +1132,14 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) {
   const Pointer &P = S.Stk.peek<Pointer>();
 
   ComparisonCategoryResult CmpResult = LHS.compare(RHS);
-  if (CmpResult == ComparisonCategoryResult::Unordered) {
-    // This should only happen with pointers.
-    const SourceInfo &Loc = S.Current->getSource(OpPC);
-    S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified)
-        << LHS.toDiagnosticString(S.getASTContext())
-        << RHS.toDiagnosticString(S.getASTContext());
-    return false;
+  if constexpr (std::is_same_v<T, Pointer>) {
+    if (CmpResult == ComparisonCategoryResult::Unordered) {
+      const SourceInfo &Loc = S.Current->getSource(OpPC);
+      S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified)
+          << LHS.toDiagnosticString(S.getASTContext())
+          << RHS.toDiagnosticString(S.getASTContext());
+      return false;
+    }
   }
 
   assert(CmpInfo);
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index 7fb89bf5b499f..63caf04f7ef38 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -786,12 +786,16 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create(
     QualType T, bool ParameterPack, TypeSourceInfo *TInfo) {
   AutoType *AT =
       C.getLangOpts().CPlusPlus20 ? T->getContainedAutoType() : nullptr;
-  return new (C, DC,
-              additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>,
-                                    Expr *>(0,
-                                            AT && AT->isConstrained() ? 1 : 0))
-      NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, ParameterPack,
-                              TInfo);
+  const bool HasConstraint = AT && AT->isConstrained();
+  auto *NTTP =
+      new (C, DC,
+           additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, Expr *>(
+               0, HasConstraint ? 1 : 0))
+          NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T,
+                                  ParameterPack, TInfo);
+  if (HasConstraint)
+    NTTP->setPlaceholderTypeConstraint(nullptr);
+  return NTTP;
 }
 
 NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create(
@@ -800,23 +804,30 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create(
     QualType T, TypeSourceInfo *TInfo, ArrayRef<QualType> ExpandedTypes,
     ArrayRef<TypeSourceInfo *> ExpandedTInfos) {
   AutoType *AT = TInfo->getType()->getContainedAutoType();
-  return new (C, DC,
-              additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>,
-                                    Expr *>(
-                  ExpandedTypes.size(), AT && AT->isConstrained() ? 1 : 0))
-      NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, TInfo,
-                              ExpandedTypes, ExpandedTInfos);
+  const bool HasConstraint = AT && AT->isConstrained();
+  auto *NTTP =
+      new (C, DC,
+           additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, Expr *>(
+               ExpandedTypes.size(), HasConstraint ? 1 : 0))
+          NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, TInfo,
+                                  ExpandedTypes, ExpandedTInfos);
+  if (HasConstraint)
+    NTTP->setPlaceholderTypeConstraint(nullptr);
+  return NTTP;
 }
 
 NonTypeTemplateParmDecl *
 NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                             bool HasTypeConstraint) {
-  return new (C, ID,
-              additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>,
-                                    Expr *>(0,
-                                            HasTypeConstraint ? 1 : 0))
+  auto *NTTP =
+      new (C, ID,
+           additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, Expr *>(
+               0, HasTypeConstraint ? 1 : 0))
       NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(), 0, 0,
                               nullptr, QualType(), false, nullptr);
+  if (HasTypeConstraint)
+    NTTP->setPlaceholderTypeConstraint(nullptr);
+  return NTTP;
 }
 
 NonTypeTemplateParmDecl *
@@ -830,6 +841,8 @@ NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
       NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(), 0, 0,
                               nullptr, QualType(), nullptr, {}, {});
   NTTP->NumExpandedTypes = NumExpandedTypes;
+  if (HasTypeConstraint)
+    NTTP->setPlaceholderTypeConstraint(nullptr);
   return NTTP;
 }
 
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 1f949d495f343..b747aa8df807d 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -3911,6 +3911,8 @@ FPOptions Expr::getFPFeaturesInEffect(const LangOptions &LO) const {
     return BO->getFPFeaturesInEffect(LO);
   if (auto Cast = dyn_cast<CastExpr>(this))
     return Cast->getFPFeaturesInEffect(LO);
+  if (auto ConvertVector = dyn_cast<ConvertVectorExpr>(this))
+    return ConvertVector->getFPFeaturesInEffect(LO);
   return FPOptions::defaultWithoutTrailingStorage(LO);
 }
 
@@ -5451,3 +5453,21 @@ OpenACCAsteriskSizeExpr *
 OpenACCAsteriskSizeExpr::CreateEmpty(const ASTContext &C) {
   return new (C) OpenACCAsteriskSizeExpr({}, C.IntTy);
 }
+
+ConvertVectorExpr *ConvertVectorExpr::CreateEmpty(const ASTContext &C,
+                                                  bool hasFPFeatures) {
+  void *Mem = C.Allocate(totalSizeToAlloc<FPOptionsOverride>(hasFPFeatures),
+                         alignof(ConvertVectorExpr));
+  return new (Mem) ConvertVectorExpr(hasFPFeatures, EmptyShell());
+}
+
+ConvertVectorExpr *ConvertVectorExpr::Create(
+    const ASTContext &C, Expr *SrcExpr, TypeSourceInfo *TI, QualType DstType,
+    ExprValueKind VK, ExprObjectKind OK, SourceLocation BuiltinLoc,
+    SourceLocation RParenLoc, FPOptionsOverride FPFeatures) {
+  bool HasFPFeatures = FPFeatures.requiresTrailingStorage();
+  unsigned Size = totalSizeToAlloc<FPOptionsOverride>(HasFPFeatures);
+  void *Mem = C.Allocate(Size, alignof(ConvertVectorExpr));
+  return new (Mem) ConvertVectorExpr(SrcExpr, TI, DstType, VK, OK, BuiltinLoc,
+                                     RParenLoc, FPFeatures);
+}
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 26493caa5d06a..fd1eaab9621dd 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -3069,3 +3069,9 @@ void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) {
 void TextNodeDumper::VisitAtomicExpr(const AtomicExpr *AE) {
   OS << ' ' << AE->getOpAsString();
 }
+
+void TextNodeDumper::VisitConvertVectorExpr(const ConvertVectorExpr *S) {
+  VisitStmt(S);
+  if (S->hasStoredFPFeatures())
+    printFPOptions(S->getStoredFPFeatures());
+}
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 1bfec0b37c5ee..f45fb0eca3714 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -124,8 +124,6 @@ static const OffloadArchToStringMap arch_names[] = {
     GFX(90a),  // gfx90a
     GFX(90c),  // gfx90c
     {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"},
-    GFX(940),  // gfx940
-    GFX(941),  // gfx941
     GFX(942),  // gfx942
     GFX(950),  // gfx950
     {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"},
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 7d13c1f145440..547cf3dfa2be7 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -211,8 +211,6 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
   case OffloadArch::GFX90a:
   case OffloadArch::GFX90c:
   case OffloadArch::GFX9_4_GENERIC:
-  case OffloadArch::GFX940:
-  case OffloadArch::GFX941:
   case OffloadArch::GFX942:
   case
OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 84a05cec04e7f..e4d3ad04fe9de 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1109,6 +1109,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasFloat128) Builder.defineMacro("__SIZEOF_FLOAT128__", "16"); + + if (Opts.CFProtectionReturn || Opts.CFProtectionBranch) + Builder.defineMacro("__CET__", Twine{(Opts.CFProtectionReturn << 1) | + Opts.CFProtectionBranch}); } bool X86TargetInfo::isValidFeatureName(StringRef Name) const { diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 2615ae382cb8b..cbecdf925aa5d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -52,6 +52,9 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, DoubleTy = cir::DoubleType::get(&getMLIRContext()); FP80Ty = cir::FP80Type::get(&getMLIRContext()); FP128Ty = cir::FP128Type::get(&getMLIRContext()); + + theModule->setAttr(cir::CIRDialect::getTripleAttrName(), + builder.getStringAttr(getTriple().str())); } mlir::Location CIRGenModule::getLoc(SourceLocation cLoc) { diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 1c7ed63773900..29bb4036218e4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -21,7 +21,9 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" #include "llvm/ADT/StringRef.h" +#include "llvm/TargetParser/Triple.h" namespace clang { class ASTContext; @@ -88,6 +90,8 @@ class CIRGenModule : public CIRGenTypeCache { void emitGlobalVarDefinition(const clang::VarDecl *vd, bool isTentative = false); + const llvm::Triple &getTriple() const { return target.getTriple(); } + /// Helpers to emit "not yet implemented" error diagnostics DiagnosticBuilder errorNYI(SourceLocation, llvm::StringRef); diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp index eab6958ac8f6d..0f686a36b982b 100644 --- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp +++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp @@ -27,8 +27,14 @@ getBackendActionFromOutputType(CIRGenAction::OutputType Action) { assert(false && "Unsupported output type for getBackendActionFromOutputType!"); break; // Unreachable, but fall through to report that + case CIRGenAction::OutputType::EmitAssembly: + return BackendAction::Backend_EmitAssembly; + case CIRGenAction::OutputType::EmitBC: + return BackendAction::Backend_EmitBC; case CIRGenAction::OutputType::EmitLLVM: return BackendAction::Backend_EmitLL; + case CIRGenAction::OutputType::EmitObj: + return BackendAction::Backend_EmitObj; } // We should only get here if a non-enum value is passed in or we went through // the assert(false) case above @@ -84,7 +90,10 @@ class CIRGenConsumer : public clang::ASTConsumer { MlirModule->print(*OutputStream, Flags); } break; - case CIRGenAction::OutputType::EmitLLVM: { + case CIRGenAction::OutputType::EmitLLVM: + case CIRGenAction::OutputType::EmitBC: + case CIRGenAction::OutputType::EmitObj: + case CIRGenAction::OutputType::EmitAssembly: { llvm::LLVMContext LLVMCtx; std::unique_ptr LLVMModule = lowerFromCIRToLLVMIR(MlirModule, LLVMCtx); @@ -111,10 +120,16 @@ static std::unique_ptr getOutputStream(CompilerInstance &CI, StringRef InFile, 
CIRGenAction::OutputType Action) { switch (Action) { + case CIRGenAction::OutputType::EmitAssembly: + return CI.createDefaultOutputFile(false, InFile, "s"); case CIRGenAction::OutputType::EmitCIR: return CI.createDefaultOutputFile(false, InFile, "cir"); case CIRGenAction::OutputType::EmitLLVM: return CI.createDefaultOutputFile(false, InFile, "ll"); + case CIRGenAction::OutputType::EmitBC: + return CI.createDefaultOutputFile(true, InFile, "bc"); + case CIRGenAction::OutputType::EmitObj: + return CI.createDefaultOutputFile(true, InFile, "o"); } llvm_unreachable("Invalid CIRGenAction::OutputType"); } @@ -132,6 +147,10 @@ CIRGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { return Result; } +void EmitAssemblyAction::anchor() {} +EmitAssemblyAction::EmitAssemblyAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitAssembly, MLIRCtx) {} + void EmitCIRAction::anchor() {} EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) : CIRGenAction(OutputType::EmitCIR, MLIRCtx) {} @@ -139,3 +158,11 @@ EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) void EmitLLVMAction::anchor() {} EmitLLVMAction::EmitLLVMAction(mlir::MLIRContext *MLIRCtx) : CIRGenAction(OutputType::EmitLLVM, MLIRCtx) {} + +void EmitBCAction::anchor() {} +EmitBCAction::EmitBCAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitBC, MLIRCtx) {} + +void EmitObjAction::anchor() {} +EmitObjAction::EmitObjAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitObj, MLIRCtx) {} diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 74ff89346f3c4..235b5a057852a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -114,6 +114,8 @@ struct ConvertCIRToLLVMPass } void runOnOperation() final; + void processCIRAttrs(mlir::ModuleOp module); + StringRef getDescription() const override { return "Convert the prepared CIR dialect module to LLVM dialect"; } @@ -271,6 +273,13 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, }); } +void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) { + // Lower the module attributes to LLVM equivalents. 
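Conceptually, processCIRAttrs carries the triple that CIRGenModule records at construction time (see the CIRGenModule.cpp hunk above) over to the LLVM dialect. A rough before/after of the module attributes (module contents illustrative; attribute names per the patch and MLIR's LLVMDialect):

    // After CIRGen:    module attributes {cir.triple = "x86_64-unknown-linux-gnu"}
    // After lowering:  module attributes {llvm.target_triple = "x86_64-unknown-linux-gnu"}

The lines that follow perform that copy.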
+ if (auto tripleAttr = module->getAttr(cir::CIRDialect::getTripleAttrName())) + module->setAttr(mlir::LLVM::LLVMDialect::getTargetTripleAttrName(), + tripleAttr); +} + void ConvertCIRToLLVMPass::runOnOperation() { llvm::TimeTraceScope scope("Convert CIR to LLVM Pass"); @@ -283,6 +292,8 @@ void ConvertCIRToLLVMPass::runOnOperation() { patterns.add(converter, patterns.getContext(), dl); + processCIRAttrs(module); + mlir::ConversionTarget target(getContext()); target.addLegalOp(); target.addLegalDialect(); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 348cb523b1718..a73ba1ff138fb 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -859,6 +859,24 @@ static void emitSincosBuiltin(CodeGenFunction &CGF, const CallExpr *E, StoreCos->setMetadata(LLVMContext::MD_noalias, AliasScopeList); } +static llvm::Value *emitModfBuiltin(CodeGenFunction &CGF, const CallExpr *E, + llvm::Intrinsic::ID IntrinsicID) { + llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Value *IntPartDest = CGF.EmitScalarExpr(E->getArg(1)); + + llvm::Value *Call = + CGF.Builder.CreateIntrinsic(IntrinsicID, {Val->getType()}, Val); + + llvm::Value *FractionalResult = CGF.Builder.CreateExtractValue(Call, 0); + llvm::Value *IntegralResult = CGF.Builder.CreateExtractValue(Call, 1); + + QualType DestPtrType = E->getArg(1)->getType()->getPointeeType(); + LValue IntegralLV = CGF.MakeNaturalAlignAddrLValue(IntPartDest, DestPtrType); + CGF.EmitStoreOfScalar(IntegralResult, IntegralLV); + + return FractionalResult; +} + /// EmitFAbs - Emit a call to @llvm.fabs(). static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) { Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType()); @@ -3377,11 +3395,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::sinh, Intrinsic::experimental_constrained_sinh)); + case Builtin::BIsincos: + case Builtin::BIsincosf: + case Builtin::BIsincosl: case Builtin::BI__builtin_sincos: case Builtin::BI__builtin_sincosf: case Builtin::BI__builtin_sincosf16: case Builtin::BI__builtin_sincosl: case Builtin::BI__builtin_sincosf128: + if (Builder.getIsFPConstrained()) + break; // TODO: Emit constrained sincos intrinsic once one exists. emitSincosBuiltin(*this, E, Intrinsic::sincos); return RValue::get(nullptr); @@ -4107,6 +4130,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_frexpf128: case Builtin::BI__builtin_frexpf16: return RValue::get(emitFrexpBuiltin(*this, E, Intrinsic::frexp)); + case Builtin::BImodf: + case Builtin::BImodff: + case Builtin::BImodfl: + case Builtin::BI__builtin_modf: + case Builtin::BI__builtin_modff: + case Builtin::BI__builtin_modfl: + if (Builder.getIsFPConstrained()) + break; // TODO: Emit constrained modf intrinsic once one exists. 
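For context on the return that follows: emitModfBuiltin splits the llvm.modf result pair into a returned fractional part and a stored integral part, mirroring libm's contract. In source terms (a sketch; values per the C standard):

    double IPart;
    double Frac = __builtin_modf(3.75, &IPart);
    // Frac == 0.75 (returned), IPart == 3.0 (stored through the pointer)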
+ return RValue::get(emitModfBuiltin(*this, E, Intrinsic::modf)); case Builtin::BI__builtin_isgreater: case Builtin::BI__builtin_isgreaterequal: case Builtin::BI__builtin_isless: @@ -19474,6 +19506,11 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, CGM.getHLSLRuntime().getAllIntrinsic(), ArrayRef{Op0}, nullptr, "hlsl.all"); } + case Builtin::BI__builtin_hlsl_and: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + Value *Op1 = EmitScalarExpr(E->getArg(1)); + return Builder.CreateAnd(Op0, Op1, "hlsl.and"); + } case Builtin::BI__builtin_hlsl_any: { Value *Op0 = EmitScalarExpr(E->getArg(0)); return Builder.CreateIntrinsic( diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 30f01496ba221..5ee8a1bfa8175 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1949,6 +1949,7 @@ Value *ScalarExprEmitter::VisitConvertVectorExpr(ConvertVectorExpr *E) { llvm::Value *Zero = llvm::Constant::getNullValue(SrcTy); if (SrcEltTy->isFloatingPointTy()) { + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); return Builder.CreateFCmpUNE(Src, Zero, "tobool"); } else { return Builder.CreateICmpNE(Src, Zero, "tobool"); @@ -1975,6 +1976,7 @@ Value *ScalarExprEmitter::VisitConvertVectorExpr(ConvertVectorExpr *E) { } else { assert(SrcEltTy->isFloatingPointTy() && DstEltTy->isFloatingPointTy() && "Unknown real conversion"); + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); if (DstEltTy->getTypeID() < SrcEltTy->getTypeID()) Res = Builder.CreateFPTrunc(Src, DstTy, "conv"); else diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index c13928f61a748..826ec4da8ea28 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2302,8 +2302,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index bae41fc06c036..d5a7fc7e85230 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -232,6 +232,11 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + if (C.getDriver().isUsingLTO()) { + if (Arg *A = tools::getLastProfileSampleUseArg(Args)) + CmdArgs.push_back(Args.MakeArgString(std::string("-lto-sample-profile:") + + A->getValue())); + } Args.AddAllArgValues(CmdArgs, options::OPT__SLASH_link); // Control Flow Guard checks diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index b9a5c0589ebc4..4eb743acf327f 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4776,17 +4776,6 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, } } - // Add the __CET__ macro if a CFProtection option is set. - if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { - StringRef Name = A->getValue(); - if (Name == "branch") - Opts.addMacroDef("__CET__=1"); - else if (Name == "return") - Opts.addMacroDef("__CET__=2"); - else if (Name == "full") - Opts.addMacroDef("__CET__=3"); - } - // Add macros from the command line. 
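The block removed above is not lost: the X86 target-defines hunk earlier in this patch now derives __CET__ from the CodeGen options, so the macro is computed per target rather than in the preprocessor options. The values are unchanged:

    // -fcf-protection=branch  ->  __CET__ == 1  (CFProtectionBranch)
    // -fcf-protection=return  ->  __CET__ == 2  (CFProtectionReturn << 1)
    // -fcf-protection=full    ->  __CET__ == 3  (both bits set)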
for (const auto *A : Args.filtered(OPT_D, OPT_U)) { if (A->getOption().matches(OPT_D)) diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index c8d004163b96d..bb3bb0aac78bf 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -62,8 +62,18 @@ CreateFrontendBaseAction(CompilerInstance &CI) { return std::make_unique(); case DumpRawTokens: return std::make_unique(); case DumpTokens: return std::make_unique(); - case EmitAssembly: return std::make_unique(); - case EmitBC: return std::make_unique(); + case EmitAssembly: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique(); +#endif + return std::make_unique(); + case EmitBC: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique(); +#endif + return std::make_unique(); case EmitCIR: #if CLANG_ENABLE_CIR return std::make_unique(); @@ -80,7 +90,12 @@ CreateFrontendBaseAction(CompilerInstance &CI) { } case EmitLLVMOnly: return std::make_unique(); case EmitCodeGenOnly: return std::make_unique(); - case EmitObj: return std::make_unique(); + case EmitObj: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique(); +#endif + return std::make_unique(); case ExtractAPI: return std::make_unique(); case FixIt: return std::make_unique(); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index d1f5fdff8b600..f03b620eee142 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -249,6 +249,28 @@ bool all(double3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_all) bool all(double4); +//===----------------------------------------------------------------------===// +// and builtins +//===----------------------------------------------------------------------===// + +/// \fn bool and(bool x, bool y) +/// \brief Logically ands two boolean vectors elementwise and produces a bool +/// vector output. 
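A short usage sketch for the overloads declared below (illustrative HLSL, not taken from the patch's tests):

    bool3 A = bool3(true, true, false);
    bool3 B = bool3(true, false, false);
    bool3 R = and(A, B);   // (true, false, false), elementwise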
+ +// TODO: Clean up clang-format marker once we've resolved +// https://github.com/llvm/llvm-project/issues/127851 +// +// clang-format off +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool and(bool x, bool y); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool2 and(bool2 x, bool2 y); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool3 and(bool3 x, bool3 y); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool4 and(bool4 x, bool4 y); +// clang-format on + //===----------------------------------------------------------------------===// // any builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index ce7d9be8d2faa..f21e571e6e0ce 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -544,25 +544,17 @@ static ControlFlowKind CheckFallThrough(AnalysisDeclContext &AC) { namespace { struct CheckFallThroughDiagnostics { - unsigned diag_MaybeFallThrough_HasNoReturn; - unsigned diag_MaybeFallThrough_ReturnsNonVoid; - unsigned diag_AlwaysFallThrough_HasNoReturn; - unsigned diag_AlwaysFallThrough_ReturnsNonVoid; - unsigned diag_NeverFallThroughOrReturn; - enum { Function, Block, Lambda, Coroutine } funMode; + unsigned diag_FallThrough_HasNoReturn = 0; + unsigned diag_FallThrough_ReturnsNonVoid = 0; + unsigned diag_NeverFallThroughOrReturn = 0; + unsigned FunKind; // TODO: use diag::FalloffFunctionKind SourceLocation FuncLoc; static CheckFallThroughDiagnostics MakeForFunction(const Decl *Func) { CheckFallThroughDiagnostics D; D.FuncLoc = Func->getLocation(); - D.diag_MaybeFallThrough_HasNoReturn = - diag::warn_falloff_noreturn_function; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::warn_maybe_falloff_nonvoid_function; - D.diag_AlwaysFallThrough_HasNoReturn = - diag::warn_falloff_noreturn_function; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::warn_falloff_nonvoid_function; + D.diag_FallThrough_HasNoReturn = diag::warn_noreturn_has_return_expr; + D.diag_FallThrough_ReturnsNonVoid = diag::warn_falloff_nonvoid; // Don't suggest that virtual functions be marked "noreturn", since they // might be overridden by non-noreturn functions. 
@@ -576,76 +568,49 @@ struct CheckFallThroughDiagnostics { isTemplateInstantiation = Function->isTemplateInstantiation(); if (!isVirtualMethod && !isTemplateInstantiation) - D.diag_NeverFallThroughOrReturn = - diag::warn_suggest_noreturn_function; - else - D.diag_NeverFallThroughOrReturn = 0; + D.diag_NeverFallThroughOrReturn = diag::warn_suggest_noreturn_function; - D.funMode = Function; + D.FunKind = diag::FalloffFunctionKind::Function; return D; } static CheckFallThroughDiagnostics MakeForCoroutine(const Decl *Func) { CheckFallThroughDiagnostics D; D.FuncLoc = Func->getLocation(); - D.diag_MaybeFallThrough_HasNoReturn = 0; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::warn_maybe_falloff_nonvoid_coroutine; - D.diag_AlwaysFallThrough_HasNoReturn = 0; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::warn_falloff_nonvoid_coroutine; - D.diag_NeverFallThroughOrReturn = 0; - D.funMode = Coroutine; + D.diag_FallThrough_ReturnsNonVoid = diag::warn_falloff_nonvoid; + D.FunKind = diag::FalloffFunctionKind::Coroutine; return D; } static CheckFallThroughDiagnostics MakeForBlock() { CheckFallThroughDiagnostics D; - D.diag_MaybeFallThrough_HasNoReturn = - diag::err_noreturn_block_has_return_expr; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::err_maybe_falloff_nonvoid_block; - D.diag_AlwaysFallThrough_HasNoReturn = - diag::err_noreturn_block_has_return_expr; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::err_falloff_nonvoid_block; - D.diag_NeverFallThroughOrReturn = 0; - D.funMode = Block; + D.diag_FallThrough_HasNoReturn = diag::err_noreturn_has_return_expr; + D.diag_FallThrough_ReturnsNonVoid = diag::err_falloff_nonvoid; + D.FunKind = diag::FalloffFunctionKind::Block; return D; } static CheckFallThroughDiagnostics MakeForLambda() { CheckFallThroughDiagnostics D; - D.diag_MaybeFallThrough_HasNoReturn = - diag::err_noreturn_lambda_has_return_expr; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::warn_maybe_falloff_nonvoid_lambda; - D.diag_AlwaysFallThrough_HasNoReturn = - diag::err_noreturn_lambda_has_return_expr; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::warn_falloff_nonvoid_lambda; - D.diag_NeverFallThroughOrReturn = 0; - D.funMode = Lambda; + D.diag_FallThrough_HasNoReturn = diag::err_noreturn_has_return_expr; + D.diag_FallThrough_ReturnsNonVoid = diag::warn_falloff_nonvoid; + D.FunKind = diag::FalloffFunctionKind::Lambda; return D; } bool checkDiagnostics(DiagnosticsEngine &D, bool ReturnsVoid, bool HasNoReturn) const { - if (funMode == Function) { + if (FunKind == diag::FalloffFunctionKind::Function) { return (ReturnsVoid || - D.isIgnored(diag::warn_maybe_falloff_nonvoid_function, - FuncLoc)) && + D.isIgnored(diag::warn_falloff_nonvoid, FuncLoc)) && (!HasNoReturn || - D.isIgnored(diag::warn_noreturn_function_has_return_expr, - FuncLoc)) && + D.isIgnored(diag::warn_noreturn_has_return_expr, FuncLoc)) && (!ReturnsVoid || D.isIgnored(diag::warn_suggest_noreturn_block, FuncLoc)); } - if (funMode == Coroutine) { + if (FunKind == diag::FalloffFunctionKind::Coroutine) { return (ReturnsVoid || - D.isIgnored(diag::warn_maybe_falloff_nonvoid_function, FuncLoc) || - D.isIgnored(diag::warn_maybe_falloff_nonvoid_coroutine, - FuncLoc)) && + D.isIgnored(diag::warn_falloff_nonvoid, FuncLoc)) && (!HasNoReturn); } // For blocks / lambdas. 
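With the per-kind diagnostic IDs collapsed into warn_falloff_nonvoid / err_falloff_nonvoid, a call site picks the flavor by streaming in the function kind and a "not in all control paths" flag; CheckFallThroughForBody below does essentially this:

    S.Diag(RBrace, diag::warn_falloff_nonvoid)
        << diag::FalloffFunctionKind::Coroutine // selects "coroutine"
        << /*NotInAllControlPaths=*/true;
    // -> "non-void coroutine does not return a value in all control paths"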
@@ -662,12 +627,10 @@ struct CheckFallThroughDiagnostics { static void CheckFallThroughForBody(Sema &S, const Decl *D, const Stmt *Body, QualType BlockType, const CheckFallThroughDiagnostics &CD, - AnalysisDeclContext &AC, - sema::FunctionScopeInfo *FSI) { + AnalysisDeclContext &AC) { bool ReturnsVoid = false; bool HasNoReturn = false; - bool IsCoroutine = FSI->isCoroutine(); if (const auto *FD = dyn_cast(D)) { if (const auto *CBody = dyn_cast(Body)) @@ -696,49 +659,40 @@ static void CheckFallThroughForBody(Sema &S, const Decl *D, const Stmt *Body, if (CD.checkDiagnostics(Diags, ReturnsVoid, HasNoReturn)) return; SourceLocation LBrace = Body->getBeginLoc(), RBrace = Body->getEndLoc(); - auto EmitDiag = [&](SourceLocation Loc, unsigned DiagID) { - if (IsCoroutine) { - if (DiagID != 0) - S.Diag(Loc, DiagID) << FSI->CoroutinePromise->getType(); - } else { - S.Diag(Loc, DiagID); - } - }; // cpu_dispatch functions permit empty function bodies for ICC compatibility. if (D->getAsFunction() && D->getAsFunction()->isCPUDispatchMultiVersion()) return; // Either in a function body compound statement, or a function-try-block. - switch (CheckFallThrough(AC)) { - case UnknownFallThrough: - break; + switch (int FallThroughType = CheckFallThrough(AC)) { + case UnknownFallThrough: + break; - case MaybeFallThrough: - if (HasNoReturn) - EmitDiag(RBrace, CD.diag_MaybeFallThrough_HasNoReturn); - else if (!ReturnsVoid) - EmitDiag(RBrace, CD.diag_MaybeFallThrough_ReturnsNonVoid); - break; - case AlwaysFallThrough: - if (HasNoReturn) - EmitDiag(RBrace, CD.diag_AlwaysFallThrough_HasNoReturn); - else if (!ReturnsVoid) - EmitDiag(RBrace, CD.diag_AlwaysFallThrough_ReturnsNonVoid); - break; - case NeverFallThroughOrReturn: - if (ReturnsVoid && !HasNoReturn && CD.diag_NeverFallThroughOrReturn) { - if (const FunctionDecl *FD = dyn_cast(D)) { - S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 0 << FD; - } else if (const ObjCMethodDecl *MD = dyn_cast(D)) { - S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 1 << MD; - } else { - S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn); - } + case MaybeFallThrough: + case AlwaysFallThrough: + if (HasNoReturn) { + if (CD.diag_FallThrough_HasNoReturn) + S.Diag(RBrace, CD.diag_FallThrough_HasNoReturn) << CD.FunKind; + } else if (!ReturnsVoid && CD.diag_FallThrough_ReturnsNonVoid) { + bool NotInAllControlPaths = FallThroughType == MaybeFallThrough; + S.Diag(RBrace, CD.diag_FallThrough_ReturnsNonVoid) + << CD.FunKind << NotInAllControlPaths; + } + break; + case NeverFallThroughOrReturn: + if (ReturnsVoid && !HasNoReturn && CD.diag_NeverFallThroughOrReturn) { + if (const FunctionDecl *FD = dyn_cast(D)) { + S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 0 << FD; + } else if (const ObjCMethodDecl *MD = dyn_cast(D)) { + S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 1 << MD; + } else { + S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn); } - break; - case NeverFallThrough: - break; + } + break; + case NeverFallThrough: + break; } } @@ -2765,7 +2719,7 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( : (fscope->isCoroutine() ? 
CheckFallThroughDiagnostics::MakeForCoroutine(D)
                   : CheckFallThroughDiagnostics::MakeForFunction(D)));
-    CheckFallThroughForBody(S, D, Body, BlockType, CD, AC, fscope);
+    CheckFallThroughForBody(S, D, Body, BlockType, CD, AC);
   }
 
   // Warning: check for unreachable code
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 8972957ded9f5..89e8082ee80e7 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -104,6 +104,7 @@ namespace {
     void CheckStaticCast();
     void CheckDynamicCast();
     void CheckCXXCStyleCast(bool FunctionalCast, bool ListInitialization);
+    bool CheckHLSLCStyleCast(CheckedConversionKind CCK);
     void CheckCStyleCast();
     void CheckBuiltinBitCast();
     void CheckAddrspaceCast();
@@ -2776,39 +2777,9 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
   CheckedConversionKind CCK = FunctionalStyle
                                   ? CheckedConversionKind::FunctionalCast
                                   : CheckedConversionKind::CStyleCast;
-
-  QualType SrcTy = SrcExpr.get()->getType();
-  // This case should not trigger on regular vector cast, vector truncation
-  if (Self.getLangOpts().HLSL &&
-      Self.HLSL().CanPerformElementwiseCast(SrcExpr.get(), DestType)) {
-    if (SrcTy->isConstantArrayType())
-      SrcExpr = Self.ImpCastExprToType(
-          SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy),
-          CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK);
-    Kind = CK_HLSLElementwiseCast;
-    return;
-  }
-
-  // This case should not trigger on regular vector splat
-  // If the relative order of this and the HLSLElementWise cast checks
-  // are changed, it might change which cast handles what in a few cases
-  if (Self.getLangOpts().HLSL &&
-      Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) {
-    const VectorType *VT = SrcTy->getAs<VectorType>();
-    // change splat from vec1 case to splat from scalar
-    if (VT && VT->getNumElements() == 1)
-      SrcExpr = Self.ImpCastExprToType(
-          SrcExpr.get(), VT->getElementType(), CK_HLSLVectorTruncation,
-          SrcExpr.get()->getValueKind(), nullptr, CCK);
-    // Inserting a scalar cast here allows for a simplified codegen in
-    // the case the destTy is a vector
-    if (const VectorType *DVT = DestType->getAs<VectorType>())
-      SrcExpr = Self.ImpCastExprToType(
-          SrcExpr.get(), DVT->getElementType(),
-          Self.PrepareScalarCast(SrcExpr, DVT->getElementType()),
-          SrcExpr.get()->getValueKind(), nullptr, CCK);
-    Kind = CK_HLSLAggregateSplatCast;
-    return;
+  if (Self.getLangOpts().HLSL) {
+    if (CheckHLSLCStyleCast(CCK))
+      return;
   }
 
   if (ValueKind == VK_PRValue && !DestType->isRecordType() &&
@@ -2927,6 +2898,56 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
   }
 }
 
+// CheckHLSLCStyleCast - Returns `true` if the cast is handled or errored as an
+// HLSL-specific cast. Returns false if the cast should be checked as a CXX
+// C-Style cast.
+bool CastOperation::CheckHLSLCStyleCast(CheckedConversionKind CCK) {
+  assert(Self.getLangOpts().HLSL && "Must be HLSL!");
+  QualType SrcTy = SrcExpr.get()->getType();
+  // HLSL has several unique forms of C-style casts which support aggregate to
+  // aggregate casting.
+ // This case should not trigger on regular vector cast, vector truncation + if (Self.HLSL().CanPerformElementwiseCast(SrcExpr.get(), DestType)) { + if (SrcTy->isConstantArrayType()) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy), + CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK); + Kind = CK_HLSLElementwiseCast; + return true; + } + + // This case should not trigger on regular vector splat + // If the relative order of this and the HLSLElementWise cast checks + // are changed, it might change which cast handles what in a few cases + if (Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) { + const VectorType *VT = SrcTy->getAs<VectorType>(); + // change splat from vec1 case to splat from scalar + if (VT && VT->getNumElements() == 1) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), VT->getElementType(), CK_HLSLVectorTruncation, + SrcExpr.get()->getValueKind(), nullptr, CCK); + // Inserting a scalar cast here allows for a simplified codegen in + // the case the destTy is a vector + if (const VectorType *DVT = DestType->getAs<VectorType>()) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), DVT->getElementType(), + Self.PrepareScalarCast(SrcExpr, DVT->getElementType()), + SrcExpr.get()->getValueKind(), nullptr, CCK); + Kind = CK_HLSLAggregateSplatCast; + return true; + } + + // If the destination is an array, we've exhausted the valid HLSL casts, so we + // should emit a diagnostic and stop processing. + if (DestType->isArrayType()) { + Self.Diag(OpRange.getBegin(), diag::err_bad_cxx_cast_generic) + << 4 << SrcTy << DestType; + SrcExpr = ExprError(); + return true; + } + return false; +} + /// DiagnoseBadFunctionCast - Warn whenever a function call is cast to a /// non-matching type. Such as enum function call to int, int call to /// pointer; etc. Cast to 'void' is an exception.
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index aae61f612a4bc..74f425d32648f 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5262,8 +5262,8 @@ ExprResult Sema::ConvertVectorExpr(Expr *E, TypeSourceInfo *TInfo, << E->getSourceRange()); } - return new (Context) class ConvertVectorExpr(E, TInfo, DstTy, VK, OK, - BuiltinLoc, RParenLoc); + return ConvertVectorExpr::Create(Context, E, TInfo, DstTy, VK, OK, BuiltinLoc, + RParenLoc, CurFPFeatureOverrides()); } bool Sema::BuiltinPrefetch(CallExpr *TheCall) { diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 957c3a0888438..20275ded8a561 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -2245,6 +2245,20 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } + case Builtin::BI__builtin_hlsl_and: { + if (SemaRef.checkArgCount(TheCall, 2)) + return true; + if (CheckVectorElementCallArgs(&SemaRef, TheCall)) + return true; + if (CheckScalarOrVector(&SemaRef, TheCall, getASTContext().BoolTy, 0)) + return true; + + ExprResult A = TheCall->getArg(0); + QualType ArgTyA = A.get()->getType(); + // return type is the same as the input type + TheCall->setType(ArgTyA); + break; + } case Builtin::BI__builtin_hlsl_all: case Builtin::BI__builtin_hlsl_any: { if (SemaRef.checkArgCount(TheCall, 1)) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 6a76e6d74a4b0..a34005bf376aa 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6585,6 +6585,18 @@ void InitializationSequence::InitializeFrom(Sema &S, } } + if (S.getLangOpts().HLSL && Initializer && isa<ConstantArrayType>(DestAT)) { + QualType SrcType = Entity.getType(); + if (SrcType->isArrayParameterType()) + SrcType = + cast<ArrayParameterType>(SrcType)->getConstantArrayType(Context); + if (S.Context.hasSameUnqualifiedType(DestType, SrcType)) { + TryArrayCopy(S, Kind, Entity, Initializer, DestType, *this, + TreatUnavailableAsInvalid); + return; + } + } + // Some kinds of initialization permit an array to be initialized from // another array of the same type, and perform elementwise initialization.
if (Initializer && isa<ConstantArrayType>(DestAT) && diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 0394edb7889ba..d0b713f074c33 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3590,7 +3590,8 @@ StmtResult Sema::ActOnCapScopeReturnStmt(SourceLocation ReturnLoc, if (auto *CurBlock = dyn_cast<BlockScopeInfo>(CurCap)) { if (CurBlock->FunctionType->castAs<FunctionType>()->getNoReturnAttr()) { - Diag(ReturnLoc, diag::err_noreturn_block_has_return_expr); + Diag(ReturnLoc, diag::err_noreturn_has_return_expr) + << diag::FalloffFunctionKind::Block; return StmtError(); } } else if (auto *CurRegion = dyn_cast<CapturedRegionScopeInfo>(CurCap)) { @@ -3601,7 +3602,8 @@ StmtResult Sema::ActOnCapScopeReturnStmt(SourceLocation ReturnLoc, if (CurLambda->CallOperator->getType() ->castAs<FunctionType>() ->getNoReturnAttr()) { - Diag(ReturnLoc, diag::err_noreturn_lambda_has_return_expr); + Diag(ReturnLoc, diag::err_noreturn_has_return_expr) + << diag::FalloffFunctionKind::Lambda; return StmtError(); } } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 05cac8db3c42c..eaabfae2409f4 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -12716,7 +12716,7 @@ TreeTransform<Derived>::TransformDeclRefExpr(DeclRefExpr *E) { ValueDecl *ND = cast_or_null<ValueDecl>(getDerived().TransformDecl(E->getLocation(), E->getDecl())); - if (!ND) + if (!ND || ND->isInvalidDecl()) return ExprError(); NamedDecl *Found = ND; diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index fba54023a6bb2..835ad4a658944 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -1387,10 +1387,15 @@ void ASTStmtReader::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { void ASTStmtReader::VisitConvertVectorExpr(ConvertVectorExpr *E) { VisitExpr(E); + bool HasFPFeatures = CurrentUnpackingBits->getNextBit(); + assert(HasFPFeatures == E->hasStoredFPFeatures()); E->BuiltinLoc = readSourceLocation(); E->RParenLoc = readSourceLocation(); E->TInfo = readTypeSourceInfo(); E->SrcExpr = Record.readSubExpr(); + if (HasFPFeatures) + E->setStoredFPFeatures( + FPOptionsOverride::getFromOpaqueInt(Record.readInt())); } void ASTStmtReader::VisitBlockExpr(BlockExpr *E) { @@ -3385,9 +3390,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = new (Context) ShuffleVectorExpr(Empty); break; - case EXPR_CONVERT_VECTOR: - S = new (Context) ConvertVectorExpr(Empty); + case EXPR_CONVERT_VECTOR: { + BitsUnpacker ConvertVectorExprBits(Record[ASTStmtReader::NumStmtFields]); + ConvertVectorExprBits.advance(ASTStmtReader::NumExprBits); + bool HasFPFeatures = ConvertVectorExprBits.getNextBit(); + S = ConvertVectorExpr::CreateEmpty(Context, HasFPFeatures); break; + } case EXPR_BLOCK: S = new (Context) BlockExpr(Empty); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index b25dadab656b0..ac80bb46afa2d 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2016,8 +2016,7 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { // For an expanded parameter pack, record the number of expansion types here // so that it's easier for deserialization to allocate the right amount of // memory.
- Expr *TypeConstraint = D->getPlaceholderTypeConstraint(); - Record.push_back(!!TypeConstraint); + Record.push_back(D->hasPlaceholderTypeConstraint()); if (D->isExpandedParameterPack()) Record.push_back(D->getNumExpansionTypes()); @@ -2025,8 +2024,9 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { // TemplateParmPosition. Record.push_back(D->getDepth()); Record.push_back(D->getPosition()); - if (TypeConstraint) - Record.AddStmt(TypeConstraint); + + if (D->hasPlaceholderTypeConstraint()) + Record.AddStmt(D->getPlaceholderTypeConstraint()); if (D->isExpandedParameterPack()) { for (unsigned I = 0, N = D->getNumExpansionTypes(); I != N; ++I) { diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 2687231d7820f..82738d3a8c88a 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -1335,11 +1335,15 @@ void ASTStmtWriter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { void ASTStmtWriter::VisitConvertVectorExpr(ConvertVectorExpr *E) { VisitExpr(E); + bool HasFPFeatures = E->hasStoredFPFeatures(); + CurrentPackingBits.addBit(HasFPFeatures); Record.AddSourceLocation(E->getBuiltinLoc()); Record.AddSourceLocation(E->getRParenLoc()); Record.AddTypeSourceInfo(E->getTypeSourceInfo()); Record.AddStmt(E->getSrcExpr()); Code = serialization::EXPR_CONVERT_VECTOR; + if (HasFPFeatures) + Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt()); } void ASTStmtWriter::VisitBlockExpr(BlockExpr *E) { diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp index 79f88553feb95..963f59831c8ed 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp @@ -61,6 +61,11 @@ class RawPtrRefMemberChecker Checker->visitRecordDecl(RD); return true; } + + bool VisitObjCContainerDecl(const ObjCContainerDecl *CD) override { + Checker->visitObjCDecl(CD); + return true; + } }; LocalVisitor visitor(this); @@ -87,6 +92,31 @@ class RawPtrRefMemberChecker } } + void visitObjCDecl(const ObjCContainerDecl *CD) const { + if (auto *ID = dyn_cast<ObjCInterfaceDecl>(CD)) { + for (auto *Ivar : ID->ivars()) + visitIvarDecl(CD, Ivar); + return; + } + if (auto *ID = dyn_cast<ObjCImplementationDecl>(CD)) { + for (auto *Ivar : ID->ivars()) + visitIvarDecl(CD, Ivar); + return; + } + } + + void visitIvarDecl(const ObjCContainerDecl *CD, + const ObjCIvarDecl *Ivar) const { + const Type *IvarType = Ivar->getType().getTypePtrOrNull(); + if (!IvarType) + return; + if (auto *IvarCXXRD = IvarType->getPointeeCXXRecordDecl()) { + std::optional<bool> IsCompatible = isPtrCompatible(IvarCXXRD); + if (IsCompatible && *IsCompatible) + reportBug(Ivar, IvarType, IvarCXXRD, CD); + } + } + + bool shouldSkipDecl(const RecordDecl *RD) const { if (!RD->isThisDeclarationADefinition()) return true; @@ -121,9 +151,10 @@ class RawPtrRefMemberChecker return false; } - void reportBug(const FieldDecl *Member, const Type *MemberType, + template <typename DeclType, typename ParentDeclType> + void reportBug(const DeclType *Member, const Type *MemberType, const CXXRecordDecl *MemberCXXRD, - const RecordDecl *ClassCXXRD) const { + const ParentDeclType *ClassCXXRD) const { assert(Member); assert(MemberType); assert(MemberCXXRD); @@ -131,7 +162,10 @@ class RawPtrRefMemberChecker SmallString<100> Buf; llvm::raw_svector_ostream Os(Buf); - Os << "Member variable "; + if (isa<ObjCContainerDecl>(ClassCXXRD)) + Os << "Instance variable "; + else + Os << "Member variable ";
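+ // The quoted member name and its owning class/container follow, producing + // e.g. "Instance variable '_ivar' in 'SomeObjC' is a raw pointer to ..." + // (the name '_ivar' here is illustrative).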
printQuotedName(Os, Member); Os << " in "; printQuotedQualifiedName(Os, ClassCXXRD); diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 6f65fa5c7cfd3..06501de64916a 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -626,6 +626,8 @@ namespace ThreeWayCmp { constexpr int k = (1 <=> 1, 0); // both-warning {{comparison result unused}} static_assert(k== 0, ""); + static_assert(__builtin_nanf("") <=> __builtin_nanf("") == -127, ""); + /// Pointers. constexpr int a[] = {1,2,3}; constexpr int b[] = {1,2,3}; diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index 7e5f6ab8815ea..5be1bb944c18c 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -907,6 +907,12 @@ namespace IncompleteArray { return c; } static_assert(test4() == 12); + + + constexpr char *f(int n) { + return new char[n](); + } + static_assert((delete[] f(2), true)); } namespace NonConstexprArrayCtor { diff --git a/clang/test/AST/ast-dump-fpfeatures.cpp b/clang/test/AST/ast-dump-fpfeatures.cpp index cd00650db55cc..eeead3462c0ec 100644 --- a/clang/test/AST/ast-dump-fpfeatures.cpp +++ b/clang/test/AST/ast-dump-fpfeatures.cpp @@ -8,6 +8,17 @@ // RUN: | sed -e "s/ //" -e "s/ imported//" \ // RUN: | FileCheck --strict-whitespace %s +// CHECK-LABEL: FunctionDecl {{.*}} no_fpfeatures_func_01 'vector2float (vector2double)' +// CHECK: CompoundStmt {{.*\>$}} +// CHECK: ReturnStmt +// CHECK: ConvertVectorExpr {{.*}} 'vector2float':'__attribute__((__vector_size__(2 * sizeof(float)))) float'{{$}} + +typedef double vector2double __attribute__((__vector_size__(16))); +typedef float vector2float __attribute__((__vector_size__(8))); +vector2float no_fpfeatures_func_01(vector2double x) { + return __builtin_convertvector(x, vector2float); +} + float func_01(float x); template @@ -248,4 +259,14 @@ __attribute__((optnone)) T func_22(T x, T y) { float func_23(float x, float y) { return func_22(x, y); -} \ No newline at end of file +} + +// CHECK-LABEL: FunctionDecl {{.*}} func_24 'vector2float (vector2double)' +// CHECK: CompoundStmt {{.*}} FPContractMode=2 ConstRoundingMode=towardzero +// CHECK: ReturnStmt +// CHECK: ConvertVectorExpr {{.*}} FPContractMode=2 ConstRoundingMode=towardzero + +#pragma STDC FENV_ROUND FE_TOWARDZERO +vector2float func_24(vector2double x) { + return __builtin_convertvector(x, vector2float); +} diff --git a/clang/test/AST/const-fpfeatures.c b/clang/test/AST/const-fpfeatures.c index 8dc3221b0638a..787bb989dd4a2 100644 --- a/clang/test/AST/const-fpfeatures.c +++ b/clang/test/AST/const-fpfeatures.c @@ -22,6 +22,12 @@ float _Complex C1u = C0; float FLu = 0.1F; // CHECK: @FLu = {{.*}} float 0x3FB99999A0000000 +typedef float vector2float __attribute__((__vector_size__(8))); +typedef double vector2double __attribute__((__vector_size__(16))); +const vector2float V2Fu = {1.0F + 0x0.000001p0F, 1.0F + 0x0.000002p0F}; +vector2double V2Du = __builtin_convertvector(V2Fu, vector2double); +// CHECK: @V2Fu = {{.*}} <2 x float> splat (float 0x3FF0000020000000) +// CHECK: @V2Du = {{.*}} <2 x double> splat (double 0x3FF0000020000000) #pragma STDC FENV_ROUND FE_DOWNWARD @@ -41,3 +47,8 @@ float _Complex C1d = C0; float FLd = 0.1F; // CHECK: @FLd = {{.*}} float 0x3FB9999980000000 + +const vector2float V2Fd = {1.0F + 0x0.000001p0F, 1.0F + 0x0.000002p0F}; +vector2double V2Dd = __builtin_convertvector(V2Fd, vector2double); +// CHECK: @V2Fd = {{.*}} <2 x float> +// CHECK: @V2Dd = {{.*}} <2 x 
double> diff --git a/clang/test/Analysis/Checkers/WebKit/unchecked-members-objc.mm b/clang/test/Analysis/Checkers/WebKit/unchecked-members-objc.mm new file mode 100644 index 0000000000000..a9a9a367fb9f4 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unchecked-members-objc.mm @@ -0,0 +1,35 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUncheckedPtrMemberChecker -verify %s + +#include "mock-types.h" + +__attribute__((objc_root_class)) +@interface NSObject ++ (instancetype) alloc; +- (instancetype) init; +- (instancetype)retain; +- (void)release; +@end + +void doSomeWork(); + +@interface SomeObjC : NSObject { + CheckedObj* _unchecked1; +// expected-warning@-1{{Instance variable '_unchecked1' in 'SomeObjC' is a raw pointer to CheckedPtr capable type 'CheckedObj'}} + CheckedPtr<CheckedObj> _counted1; + [[clang::suppress]] CheckedObj* _unchecked2; +} +- (void)doWork; +@end + +@implementation SomeObjC { + CheckedObj* _unchecked3; +// expected-warning@-1{{Instance variable '_unchecked3' in 'SomeObjC' is a raw pointer to CheckedPtr capable type 'CheckedObj'}} + CheckedPtr<CheckedObj> _counted2; + [[clang::suppress]] CheckedObj* _unchecked4; +} + +- (void)doWork { + doSomeWork(); +} + +@end diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-members-objc.mm b/clang/test/Analysis/Checkers/WebKit/uncounted-members-objc.mm new file mode 100644 index 0000000000000..83b08a6841d26 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-members-objc.mm @@ -0,0 +1,35 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.NoUncountedMemberChecker -verify %s + +#include "mock-types.h" + +__attribute__((objc_root_class)) +@interface NSObject ++ (instancetype) alloc; +- (instancetype) init; +- (instancetype)retain; +- (void)release; +@end + +void doSomeWork(); + +@interface SomeObjC : NSObject { + RefCountable* _uncounted1; +// expected-warning@-1{{Instance variable '_uncounted1' in 'SomeObjC' is a raw pointer to ref-countable type 'RefCountable'}} + RefPtr<RefCountable> _counted1; + [[clang::suppress]] RefCountable* _uncounted2; +} +- (void)doWork; +@end + +@implementation SomeObjC { + RefCountable* _uncounted3; +// expected-warning@-1{{Instance variable '_uncounted3' in 'SomeObjC' is a raw pointer to ref-countable type 'RefCountable'}} + RefPtr<RefCountable> _counted2; + [[clang::suppress]] RefCountable* _uncounted4; +} + +- (void)doWork { + doSomeWork(); +} + +@end diff --git a/clang/test/CIR/emit-actions.cpp b/clang/test/CIR/emit-actions.cpp new file mode 100644 index 0000000000000..94ddf23b34753 --- /dev/null +++ b/clang/test/CIR/emit-actions.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -S %s -o - | FileCheck %s -check-prefix=ASM + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm-bc %s -o %t.bc +// RUN: llvm-dis %t.bc -o - | FileCheck %s -check-prefix=BC + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-obj %s -o %t.o +// RUN: llvm-objdump -t %t.o | FileCheck %s -check-prefix=OBJ + +// TODO: Make this test target-independent +// REQUIRES: x86-registered-target + +int x = 1; + +// BC: @x = dso_local global i32 1 + +// ASM: x: +// ASM: .long 1 +// ASM: .size x, 4 + +// OBJ: .data +// OBJ-SAME: x diff --git a/clang/test/CodeGen/AArch64/sincos.c b/clang/test/CodeGen/AArch64/sincos.c index b77d98ceab486..736c0892ed741 100644 --- a/clang/test/CodeGen/AArch64/sincos.c +++ b/clang/test/CodeGen/AArch64/sincos.c @@ -1,6 +1,10 @@ // RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -O1 %s -o - | FileCheck --check-prefix=NO-MATH-ERRNO %s
// RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -fmath-errno %s -o - | FileCheck --check-prefix=MATH-ERRNO %s +void sincos(double, double*, double*); +void sincosf(float, float*, float*); +void sincosl(long double, long double*, long double*); + // NO-MATH-ERRNO-LABEL: @sincos_f32 // NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { float, float } @llvm.sincos.f32(float {{.*}}) // NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { float, float } [[SINCOS]], 0 @@ -12,6 +16,20 @@ // MATH-ERRNO: call void @sincosf( // void sincos_f32(float x, float* fp0, float* fp1) { + sincosf(x, fp0, fp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f32 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { float, float } @llvm.sincos.f32(float {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { float, float } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { float, float } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store float [[SIN]], ptr {{.*}}, align 4, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store float [[COS]], ptr {{.*}}, align 4, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f32 +// MATH-ERRNO: call void @sincosf( +// +void sincos_builtin_f32(float x, float* fp0, float* fp1) { __builtin_sincosf(x, fp0, fp1); } @@ -26,6 +44,20 @@ void sincos_f32(float x, float* fp0, float* fp1) { // MATH-ERRNO: call void @sincos( // void sincos_f64(double x, double* dp0, double* dp1) { + sincos(x, dp0, dp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f64 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { double, double } @llvm.sincos.f64(double {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { double, double } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { double, double } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store double [[SIN]], ptr {{.*}}, align 8, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store double [[COS]], ptr {{.*}}, align 8, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f64 +// MATH-ERRNO: call void @sincos( +// +void sincos_builtin_f64(double x, double* dp0, double* dp1) { __builtin_sincos(x, dp0, dp1); } @@ -40,5 +72,19 @@ void sincos_f64(double x, double* dp0, double* dp1) { // MATH-ERRNO: call void @sincosl( // void sincos_f128(long double x, long double* ldp0, long double* ldp1) { + sincosl(x, ldp0, ldp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f128 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { fp128, fp128 } @llvm.sincos.f128(fp128 {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store fp128 [[SIN]], ptr {{.*}}, align 16, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store fp128 [[COS]], ptr {{.*}}, align 16, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f128 +// MATH-ERRNO: call void @sincosl( +// +void sincos_builtin_f128(long double x, long double* ldp0, long double* ldp1) { __builtin_sincosl(x, ldp0, ldp1); } diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index d7bf7d57fba26..d5301b7bafd9c 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -38,6 +38,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO-NEXT: [[FREXP_F128_0:%.+]] = extractvalue { fp128, i32 } [[FREXP_F128]], 0 +// NO__ERRNO: 
[[MODF_F64:%.+]] = call { double, double } @llvm.modf.f64(double %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F64_FP:%.+]] = extractvalue { double, double } [[MODF_F64]], 0 +// NO__ERRNO-NEXT: [[MODF_F64_IP:%.+]] = extractvalue { double, double } [[MODF_F64]], 1 +// NO__ERRNO-NEXT: store double [[MODF_F64_IP]], ptr %{{.+}}, align 8 + +// NO__ERRNO: [[MODF_F32:%.+]] = call { float, float } @llvm.modf.f32(float %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F32_FP:%.+]] = extractvalue { float, float } [[MODF_F32]], 0 +// NO__ERRNO-NEXT: [[MODF_F32_IP:%.+]] = extractvalue { float, float } [[MODF_F32]], 1 +// NO__ERRNO-NEXT: store float [[MODF_F32_IP]], ptr %{{.+}}, align 4 + +// NO__ERRNO: [[MODF_F80:%.+]] = call { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80 %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F80_FP:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[MODF_F80]], 0 +// NO__ERRNO-NEXT: [[MODF_F80_IP:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[MODF_F80]], 1 +// NO__ERRNO-NEXT: store x86_fp80 [[MODF_F80_IP]], ptr %{{.+}}, align 16 + +// NO__ERRNO: call fp128 @modff128(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) + + // NO__ERRNO: [[SINCOS_F64:%.+]] = call { double, double } @llvm.sincos.f64(double %{{.+}}) // NO__ERRNO-NEXT: [[SINCOS_F64_0:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 0 // NO__ERRNO-NEXT: [[SINCOS_F64_1:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 1 @@ -139,13 +157,13 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_modf(f,d); __builtin_modff(f,fp); __builtin_modfl(f,l); __builtin_modff128(f,l); -// NO__ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] -// NO__ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] -// NO__ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] -// NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] +// NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] __builtin_nan(c); __builtin_nanf(c); __builtin_nanl(c); __builtin_nanf128(c); diff --git a/clang/test/CodeGen/aix-builtin-mapping.c b/clang/test/CodeGen/aix-builtin-mapping.c index a79218c6f1d8b..cc1cc1a44f32c 100644 --- a/clang/test/CodeGen/aix-builtin-mapping.c +++ b/clang/test/CodeGen/aix-builtin-mapping.c @@ -17,6 +17,6 @@ int main() returnValue = __builtin_ldexpl(1.0L, 1); } -// CHECK: %call = call double @modf(double noundef 1.000000e+00, ptr noundef %DummyLongDouble) #3 +// CHECK: %{{.+}} = call { double, double } @llvm.modf.f64(double 1.000000e+00) // CHECK: %{{.+}} = call { double, i32 } @llvm.frexp.f64.i32(double 0.000000e+00) // CHECK: %{{.+}} = call double 
@llvm.ldexp.f64.i32(double 1.000000e+00, i32 1) diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index c116604288546..0cd81a77f5cc5 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -86,7 +86,7 @@ int div(int x, int y) { } // CHECK-LABEL: define dso_local i32 @null( -// CHECK-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // @@ -102,7 +102,7 @@ int div(int x, int y) { // CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( -// TR-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// TR-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // TR-NEXT: [[ENTRY:.*:]] // TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] @@ -116,7 +116,7 @@ int div(int x, int y) { // TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 @null( -// REC-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// REC-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // REC-NEXT: [[ENTRY:.*:]] // REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] diff --git a/clang/test/CodeGen/builtin-attributes.c b/clang/test/CodeGen/builtin-attributes.c index e5b0faccfd23f..506b165fcf36e 100644 --- a/clang/test/CodeGen/builtin-attributes.c +++ b/clang/test/CodeGen/builtin-attributes.c @@ -24,6 +24,11 @@ char* f2(char* a, char* b) { return __builtin_strstr(a, b); } +// Note: Use asm label to disable intrinsic lowering of modf. +double modf(double x, double*) asm("modf"); +float modff(float x, float*) asm("modff"); +long double modfl(long double x, long double*) asm("modfl"); + // frexp is NOT readnone. It writes to its pointer argument. 
// // CHECK: f3 @@ -55,9 +60,9 @@ int f3(double x) { frexp(x, &e); frexpf(x, &e); frexpl(x, &e); - __builtin_modf(x, &e); - __builtin_modff(x, &e); - __builtin_modfl(x, &e); + modf(x, &e); + modff(x, &e); + modfl(x, &e); __builtin_remquo(x, x, &e); __builtin_remquof(x, x, &e); __builtin_remquol(x, x, &e); diff --git a/clang/test/CodeGen/math-builtins-long.c b/clang/test/CodeGen/math-builtins-long.c index 183349e0f0173..87e64a2eaa1c3 100644 --- a/clang/test/CodeGen/math-builtins-long.c +++ b/clang/test/CodeGen/math-builtins-long.c @@ -58,9 +58,9 @@ void foo(long double f, long double *l, int *i, const char *c) { // PPCF128: call fp128 @ldexpf128(fp128 noundef %{{.+}}, {{(signext)?.+}}) __builtin_ldexpl(f,f); - // F80: call x86_fp80 @modfl(x86_fp80 noundef %{{.+}}, ptr noundef %{{.+}}) - // PPC: call ppc_fp128 @modfl(ppc_fp128 noundef %{{.+}}, ptr noundef %{{.+}}) - // X86F128: call fp128 @modfl(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) + // F80: call { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80 %{{.+}}) + // PPC: call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %{{.+}}) + // X86F128: call { fp128, fp128 } @llvm.modf.f128(fp128 %{{.+}}) // PPCF128: call fp128 @modff128(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) __builtin_modfl(f,l); diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index bcc61c8f046b4..ad297828f48ed 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -83,12 +83,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { modf(f,d); modff(f,fp); modfl(f,l); - // NO__ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] - // NO__ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] - // NO__ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] + // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] // HAS_MAYTRAP: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] // HAS_MAYTRAP: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] @@ -660,6 +660,17 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // HAS_MAYTRAP: declare float @llvm.experimental.constrained.sinh.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.sinh.f80( +sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); + +// NO__ERRNO: declare { double, double } @llvm.sincos.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { float, float } @llvm.sincos.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.sincos.f80(x86_fp80) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare void @sincos(double noundef, ptr noundef, 
ptr noundef) [[NOT_READNONE]] +// HAS_ERRNO: declare void @sincosf(float noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_ERRNO: declare void @sincosl(x86_fp80 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincos(double noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincosf(float noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincosl(x86_fp80 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] sqrt(f); sqrtf(f); sqrtl(f); diff --git a/clang/test/CodeGen/pragma-fenv_access.c b/clang/test/CodeGen/pragma-fenv_access.c index afca115ed08d1..347e9670c4742 100644 --- a/clang/test/CodeGen/pragma-fenv_access.c +++ b/clang/test/CodeGen/pragma-fenv_access.c @@ -242,3 +242,12 @@ float func_20(float x, float y) { // CHECK-LABEL: @func_20 // STRICT: call float @llvm.experimental.constrained.fadd.f32(float {{.*}}, float {{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // DEFAULT: fadd float + +typedef double vector4double __attribute__((__vector_size__(32))); +typedef float vector4float __attribute__((__vector_size__(16))); +vector4float func_21(vector4double x) { + #pragma STDC FENV_ROUND FE_UPWARD + return __builtin_convertvector(x, vector4float); +} +// CHECK-LABEL: @func_21 +// STRICT: call <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64(<4 x double> {{.*}}, metadata !"round.upward", metadata !"fpexcept.strict") diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 47fa3967fe237..37fca614c3111 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -11,7 +11,7 @@ // RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fcuda-is-device -target-cpu gfx942 -fnative-half-type \ // RUN: -fnative-half-arguments-and-returns -munsafe-fp-atomics \ // RUN: | FileCheck -check-prefix=UNSAFE %s diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp index 3662a270713b6..83daf57be22ff 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm | FileCheck %s -// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned captures(ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } @@ -22,12 +22,12 @@ // CHECK: declare ptr @__dynamic_cast(ptr, ptr, ptr, i64) local_unnamed_addr -// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned captures(ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned %b) local_unnamed_addr // CHECK-NEXT: entry // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } -// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly captures(address_is_null, ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %b, null // CHECK-NEXT: br i1 [[isnull]], 
label %[[dynamic_cast_end:[a-z0-9._]+]], label %[[dynamic_cast_notnull:[a-z0-9._]+]] diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp index 2a838708ca231..c471e5dbd7b33 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -24,7 +24,7 @@ // CHECK-NEXT: ret ptr @_ZTS1A // CHECK-NEXT: } -// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly captures(address_is_null) %a) local_unnamed_addr +// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly %a) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %a, null // CHECK-NEXT: br i1 [[isnull]], label %[[bad_typeid:[a-z0-9._]+]], label %[[end:[a-z0-9.+]+]] diff --git a/clang/test/CodeGenHLSL/builtins/and.hlsl b/clang/test/CodeGenHLSL/builtins/and.hlsl new file mode 100644 index 0000000000000..b77889cd9ae70 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/and.hlsl @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s + +// CHECK-LABEL: define noundef i1 @_Z15test_and_scalarbb( +// CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and i1 [[X]], [[Y]] +// CHECK-NEXT: ret i1 [[HLSL_AND]] +// +bool test_and_scalar(bool x, bool y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_( +// CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <2 x i1> [[X]], [[Y]] +// CHECK-NEXT: ret <2 x i1> [[HLSL_AND]] +// +bool2 test_and_bool2(bool2 x, bool2 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_( +// CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <3 x i1> [[X]], [[Y]] +// CHECK-NEXT: ret <3 x i1> [[HLSL_AND]] +// +bool3 test_and_bool3(bool3 x, bool3 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_( +// CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <4 x i1> [[X]], [[Y]] +// CHECK-NEXT: ret <4 x i1> [[HLSL_AND]] +// +bool4 test_and_bool4(bool4 x, bool4 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <4 x i1> @_Z13test_and_int4Dv4_iS_( +// CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[X]], zeroinitializer +// CHECK-NEXT: [[TOBOOL1:%.*]] = icmp ne <4 x i32> [[Y]], zeroinitializer +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <4 x i1> [[TOBOOL]], [[TOBOOL1]] +// CHECK-NEXT: ret <4 x i1> [[HLSL_AND]] +// +bool4 test_and_int4(int4 x, int4 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <4 x i1> @_Z15test_and_float4Dv4_fS_( +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TOBOOL:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[X]], zeroinitializer +// CHECK-NEXT: [[TOBOOL1:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[Y]], zeroinitializer +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <4 x i1> [[TOBOOL]], [[TOBOOL1]] +// CHECK-NEXT: ret <4 x i1> [[HLSL_AND]] +// +bool4 test_and_float4(float4 x, float4 y) { + return and(x, y); +} diff --git a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl index 62fd20c4d1414..0aadaad2dca5c 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl @@ -22,7 +22,7 @@ __amdgpu_buffer_rsrc_t getBuffer(void *p) { } // CHECK-LABEL: define {{[^@]+}}@consumeBufferPtr -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(5) [[P]], addrspacecast (ptr null to ptr addrspace(5)) // CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] @@ -39,7 +39,7 @@ void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) { } // CHECK-LABEL: define {{[^@]+}}@test -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 633f1dec5e370..d12dcead6fadf 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -29,8 +29,6 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s -// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s -// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s @@ -85,8 +83,6 @@ // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX940: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" -// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/as_type.cl b/clang/test/CodeGenOpenCL/as_type.cl index 2c6cdc3810b4d..1fe26fbeafdb4 100644 --- a/clang/test/CodeGenOpenCL/as_type.cl +++ b/clang/test/CodeGenOpenCL/as_type.cl @@ -67,7 +67,7 @@ int3 f8(char16 x) { return __builtin_astype(x, int3); } -//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone captures(ret: address, provenance) %[[x:.*]]) +//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone %[[x:.*]]) //CHECK: %[[cast:.*]] ={{.*}} addrspacecast ptr %[[x]] to ptr addrspace(1) //CHECK: ret ptr addrspace(1) %[[cast]] global int* addr_cast(int *x) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index 6593a8de566f6..f300b05fe798a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,5 +1,5 @@ // 
REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl similarity index 98% rename from clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl rename to clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl index a2f14c652c828..789f6e07240d7 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // REQUIRES: amdgpu-registered-target typedef unsigned int u32; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl index 521121f5e7e54..c91cf158948b9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl @@ -2,7 +2,7 @@ // RUN: -verify -o - %s // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -emit-llvm \ // RUN: -verify -o - %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 -emit-llvm \ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 -emit-llvm \ // RUN: -verify -o - %s // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm \ // RUN: -verify -o - %s diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl index 45d2fa18efd53..b3367202f824e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90c -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl index 00346baa6ff84..79083c3c5f0f9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -1,7 +1,7 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm 
-o - %s | FileCheck %s --check-prefix=CHECK-GFX908 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -DMFMA_GFX942_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX942 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950 #pragma OPENCL EXTENSION cl_khr_fp64:enable @@ -226,189 +226,189 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c) #endif // MFMA_GFX90A_TESTS -#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) -// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8 -// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) +#if defined(MFMA_GFX942_TESTS) || defined(MFMA_GFX950_TESTS) +// CHECK-GFX942-LABEL: @test_mfma_i32_16x16x32_i8 +// CHECK-GFX942: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c) { *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8 -// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_i32_32x32x16_i8 +// CHECK-GFX942: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c) { *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x8_xf32 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x4_xf32 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_bf8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8 -// CHECK-GFX940: 
call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_bf8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_fp8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_fp8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_bf8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_bf8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_fp8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_fp8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0); } 
-// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x32_f16 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x16_f16 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x32_bf16 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x16_bf16 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8 -// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_i32_16x16x64_i8 +// CHECK-GFX942: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0) void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx) { *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8 -// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_i32_32x32x32_i8 +// CHECK-GFX942: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0) void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx) { *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8 +// CHECK-GFX942: call <4 x 
float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void 
test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0); } -#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) +#endif // defined(MFMA_GFX942_TESTS) || defined(MFMA_GFX950_TESTS) #ifdef MFMA_GFX950_TESTS diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl similarity index 84% rename from clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl rename to clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl index 832d7df00db14..24d05fe3a8525 100644 --- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ // RUN: %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ -// RUN: -S -o - %s | FileCheck -check-prefix=GFX940 %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ +// RUN: -S -o - %s | FileCheck -check-prefix=GFX942 %s // REQUIRES: amdgpu-registered-target @@ -12,8 +12,8 @@ typedef short __attribute__((ext_vector_type(2))) short2; // CHECK-LABEL: test_flat_add_f32 // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} -// GFX940-LABEL: test_flat_add_f32 -// GFX940: flat_atomic_add_f32 +// GFX942-LABEL: test_flat_add_f32 +// GFX942: flat_atomic_add_f32 half2 test_flat_add_f32(__generic float *addr, float x) { return __builtin_amdgcn_flat_atomic_fadd_f32(addr, x); } @@ -21,8 +21,8 @@ half2 test_flat_add_f32(__generic float *addr, float x) { // CHECK-LABEL: test_flat_add_2f16 // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} -// GFX940-LABEL: test_flat_add_2f16 -// GFX940: flat_atomic_pk_add_f16 +// GFX942-LABEL: test_flat_add_2f16 +// GFX942: flat_atomic_pk_add_f16 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x); } @@ -32,8 +32,8 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_flat_add_2bf16 -// GFX940: flat_atomic_pk_add_bf16 +// GFX942-LABEL: test_flat_add_2bf16 +// GFX942: flat_atomic_pk_add_bf16 short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x); } @@ -43,8 +43,8 @@ short2 
test_flat_add_2bf16(__generic short2 *addr, short2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_global_add_2bf16 -// GFX940: global_atomic_pk_add_bf16 +// GFX942-LABEL: test_global_add_2bf16 +// GFX942: global_atomic_pk_add_bf16 short2 test_global_add_2bf16(__global short2 *addr, short2 x) { return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); } @@ -55,24 +55,24 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4{{$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_local_add_2bf16 -// GFX940: ds_pk_add_rtn_bf16 +// GFX942-LABEL: test_local_add_2bf16 +// GFX942: ds_pk_add_rtn_bf16 short2 test_local_add_2bf16(__local short2 *addr, short2 x) { return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); } // CHECK-LABEL: test_local_add_2f16 // CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4 -// GFX940-LABEL: test_local_add_2f16 -// GFX940: ds_pk_add_rtn_f16 +// GFX942-LABEL: test_local_add_2f16 +// GFX942: ds_pk_add_rtn_f16 half2 test_local_add_2f16(__local half2 *addr, half2 x) { return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); } // CHECK-LABEL: test_local_add_2f16_noret // CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4 -// GFX940-LABEL: test_local_add_2f16_noret -// GFX940: ds_pk_add_f16 +// GFX942-LABEL: test_local_add_2f16_noret +// GFX942: ds_pk_add_f16 void test_local_add_2f16_noret(__local half2 *addr, half2 x) { __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); } diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c index 97303510d6881..447ee4bd3a6f9 100644 --- a/clang/test/Driver/aarch64-mcpu.c +++ b/clang/test/Driver/aarch64-mcpu.c @@ -92,7 +92,7 @@ // COBALT-100: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n2" // RUN: %clang --target=aarch64 -mcpu=grace -### -c %s 2>&1 | FileCheck -check-prefix=GRACE %s -// GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v2" +// GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "grace" // ================== Check whether -mcpu and -mtune accept mixed-case values. 
// RUN: %clang --target=aarch64 -mcpu=Cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CASE-INSENSITIVE-CA53 %s diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index d97b2ddb1fc66..35dc190761ca4 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -107,8 +107,6 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx909 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90a -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90c -DFAMILY=GFX9 -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx940 -DFAMILY=GFX9 -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx941 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx942 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx950 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 -DFAMILY=GFX10 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl index 7c34d3ec6c63a..ad5fd8ebaa6a6 100644 --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -92,8 +92,6 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefix=GFX909 %s // RUN: %clang -### -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A %s // RUN: %clang -### -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefix=GFX90C %s -// RUN: %clang -### -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940 %s -// RUN: %clang -### -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefix=GFX941 %s // RUN: %clang -### -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefix=GFX942 %s // RUN: %clang -### -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefix=GFX950 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s @@ -148,8 +146,6 @@ // GFX909: "-target-cpu" "gfx909" // GFX90A: "-target-cpu" "gfx90a" // GFX90C: "-target-cpu" "gfx90c" -// GFX940: "-target-cpu" "gfx940" -// GFX941: "-target-cpu" "gfx941" // GFX942: "-target-cpu" "gfx942" // GFX950: "-target-cpu" "gfx950" // GFX1010: "-target-cpu" "gfx1010" diff --git a/clang/test/Driver/cl-link.c b/clang/test/Driver/cl-link.c index 9bf8a8137926d..726bc26a64edd 100644 --- a/clang/test/Driver/cl-link.c +++ b/clang/test/Driver/cl-link.c @@ -71,3 +71,6 @@ // RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /Tc%s -fuse-ld=lld -### -fsanitize=address 2>&1 | FileCheck --check-prefix=INFER-LLD %s // INFER-LLD: lld-link // INFER-LLD-NOT: INFERASANLIBS + +// RUN: %clang_cl --target=x86_64-unknown-windows-msvc /Tc%s -flto -fuse-ld=lld -### -fprofile-sample-use=%S/Inputs/file.prof 2>&1 | FileCheck -check-prefix=CHECK-SAMPLE-PROFILE %s +// CHECK-SAMPLE-PROFILE: "-lto-sample-profile:{{.*}}/file.prof" diff --git 
a/clang/test/Driver/cuda-bad-arch.cu b/clang/test/Driver/cuda-bad-arch.cu index 8c8c5c3401329..85231a5b9705a 100644 --- a/clang/test/Driver/cuda-bad-arch.cu +++ b/clang/test/Driver/cuda-bad-arch.cu @@ -23,7 +23,7 @@ // RUN: | FileCheck -check-prefix OK %s // RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx90a -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s -// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx940 -c %s 2>&1 \ +// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx942 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s // We don't allow using NVPTX/AMDGCN for host compilation. diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip index 3b3afba0b18ca..bd93f9985a774 100644 --- a/clang/test/Driver/hip-macros.hip +++ b/clang/test/Driver/hip-macros.hip @@ -49,15 +49,13 @@ // RUN: %s 2>&1 | FileCheck --check-prefixes=IMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx1100 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=IMAGE,NOWARN %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s -// RUN: %clang -E -dM --offload-arch=gfx941 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx1100 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -Xclang -target-feature -Xclang "-image-insts" %s 2>&1 | FileCheck --check-prefixes=IMAGE,WARN %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -Xclang -target-feature -Xclang "+image-insts" %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,WARN %s // NOWARN-NOT: warning // WARN: warning: feature flag '{{[+|-]}}image-insts' is ignored since the feature is read only [-Winvalid-command-line-argument] @@ -68,9 +66,9 @@ // RUN: %clang -E -dM --offload-arch=gfx1100 -nogpuinc -nogpulib \ // RUN: -fgpu-default-stream=per-thread %s 2>&1 | FileCheck --check-prefixes=PTS %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -fgpu-default-stream=legacy %s 2>&1 | FileCheck --check-prefixes=NOPTS %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOPTS %s // PTS-DAG: #define __HIP_API_PER_THREAD_DEFAULT_STREAM__ 1 // PTS-DAG: #define __HIP_API_PER_THREAD_DEFAULT_STREAM__ 1 diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-grace.c b/clang/test/Driver/print-enabled-extensions/aarch64-grace.c new file mode 100644 index 0000000000000..fde6aee468cdc --- /dev/null +++ b/clang/test/Driver/print-enabled-extensions/aarch64-grace.c @@ -0,0 +1,62 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang --target=aarch64 --print-enabled-extensions -mcpu=grace | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s + +// CHECK: 
Extensions enabled for the given AArch64 target +// CHECK-EMPTY: +// CHECK-NEXT: Architecture Feature(s) Description +// CHECK-NEXT: FEAT_AES, FEAT_PMULL Enable AES support +// CHECK-NEXT: FEAT_AMUv1 Enable Armv8.4-A Activity Monitors extension +// CHECK-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// CHECK-NEXT: FEAT_BF16 Enable BFloat16 Extension +// CHECK-NEXT: FEAT_BTI Enable Branch Target Identification +// CHECK-NEXT: FEAT_CCIDX Enable Armv8.3-A Extend of the CCSIDR number of sets +// CHECK-NEXT: FEAT_CRC32 Enable Armv8.0-A CRC-32 checksum instructions +// CHECK-NEXT: FEAT_CSV2_2 Enable architectural speculation restriction +// CHECK-NEXT: FEAT_DIT Enable Armv8.4-A Data Independent Timing instructions +// CHECK-NEXT: FEAT_DPB Enable Armv8.2-A data Cache Clean to Point of Persistence +// CHECK-NEXT: FEAT_DPB2 Enable Armv8.5-A Cache Clean to Point of Deep Persistence +// CHECK-NEXT: FEAT_DotProd Enable dot product support +// CHECK-NEXT: FEAT_ETE Enable Embedded Trace Extension +// CHECK-NEXT: FEAT_FCMA Enable Armv8.3-A Floating-point complex number support +// CHECK-NEXT: FEAT_FHM Enable FP16 FML instructions +// CHECK-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int +// CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions +// CHECK-NEXT: FEAT_FlagM2 Enable alternative NZCV format for floating point comparisons +// CHECK-NEXT: FEAT_I8MM Enable Matrix Multiply Int8 Extension +// CHECK-NEXT: FEAT_JSCVT Enable Armv8.3-A JavaScript FP conversion instructions +// CHECK-NEXT: FEAT_LOR Enable Armv8.1-A Limited Ordering Regions extension +// CHECK-NEXT: FEAT_LRCPC Enable support for RCPC extension +// CHECK-NEXT: FEAT_LRCPC2 Enable Armv8.4-A RCPC instructions with Immediate Offsets +// CHECK-NEXT: FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions +// CHECK-NEXT: FEAT_LSE2 Enable Armv8.4-A Large System Extension 2 (LSE2) atomicity rules +// CHECK-NEXT: FEAT_MPAM Enable Armv8.4-A Memory system Partitioning and Monitoring extension +// CHECK-NEXT: FEAT_MTE, FEAT_MTE2 Enable Memory Tagging Extension +// CHECK-NEXT: FEAT_NV, FEAT_NV2 Enable Armv8.4-A Nested Virtualization Enchancement +// CHECK-NEXT: FEAT_PAN Enable Armv8.1-A Privileged Access-Never extension +// CHECK-NEXT: FEAT_PAN2 Enable Armv8.2-A PAN s1e1R and s1e1W Variants +// CHECK-NEXT: FEAT_PAuth Enable Armv8.3-A Pointer Authentication extension +// CHECK-NEXT: FEAT_PMUv3 Enable Armv8.0-A PMUv3 Performance Monitors extension +// CHECK-NEXT: FEAT_RAS, FEAT_RASv1p1 Enable Armv8.0-A Reliability, Availability and Serviceability Extensions +// CHECK-NEXT: FEAT_RDM Enable Armv8.1-A Rounding Double Multiply Add/Subtract instructions +// CHECK-NEXT: FEAT_RNG Enable Random Number generation instructions +// CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier +// CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension +// CHECK-NEXT: FEAT_SHA1, FEAT_SHA256 Enable SHA1 and SHA256 support +// CHECK-NEXT: FEAT_SHA3, FEAT_SHA512 Enable SHA512 and SHA3 support +// CHECK-NEXT: FEAT_SM4, FEAT_SM3 Enable SM3 and SM4 support +// CHECK-NEXT: FEAT_SPE Enable Statistical Profiling extension +// CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions +// CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 
Enable Speculative Store Bypass Safe bit +// CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions +// CHECK-NEXT: FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable SVE AES and quadword SVE polynomial multiply instructions +// CHECK-NEXT: FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SM4 Enable SM4 SVE2 instructions +// CHECK-NEXT: FEAT_TLBIOS, FEAT_TLBIRANGE Enable Armv8.4-A TLB Range and Maintenance instructions +// CHECK-NEXT: FEAT_TRBE Enable Trace Buffer Extension +// CHECK-NEXT: FEAT_TRF Enable Armv8.4-A Trace extension +// CHECK-NEXT: FEAT_UAO Enable Armv8.2-A UAO PState +// CHECK-NEXT: FEAT_VHE Enable Armv8.1-A Virtual Host extension \ No newline at end of file diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c index 642d2df211c21..9ef44b2bb403e 100644 --- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c +++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c @@ -45,8 +45,6 @@ // CHECK-SAME: {{^}}, gfx909 // CHECK-SAME: {{^}}, gfx90a // CHECK-SAME: {{^}}, gfx90c -// CHECK-SAME: {{^}}, gfx940 -// CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 // CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx1010 diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c index 3afcdf8c9fe5c..06ef72878340f 100644 --- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c +++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c @@ -52,8 +52,6 @@ // CHECK-SAME: {{^}}, gfx90a // CHECK-SAME: {{^}}, gfx90c // CHECK-SAME: {{^}}, gfx9-4-generic -// CHECK-SAME: {{^}}, gfx940 -// CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 // CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx10-1-generic diff --git a/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp b/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp new file mode 100644 index 0000000000000..73dff88e506b4 --- /dev/null +++ b/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp @@ -0,0 +1,55 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t + +// RUN: %clang_cc1 -std=c++20 mod.cppm -emit-module-interface -o mod.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++20 main.cpp -fmodule-file=mod=mod.pcm -verify -fallow-pcm-with-compiler-errors -fsyntax-only -ast-dump-all | FileCheck %s + +// RUN: %clang_cc1 -std=c++20 mod.cppm -emit-reduced-module-interface -o mod.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++20 main.cpp -fmodule-file=mod=mod.pcm -verify -fallow-pcm-with-compiler-errors -fsyntax-only -ast-dump-all | FileCheck %s + +//--- mod.cppm +export module mod; + +template +concept ReferenceOf = Q; + +// expected-error@+2 {{unknown type name 'AngleIsInvalidNow'}} +// expected-error@+1 {{constexpr variable 'angle' must be initialized by a constant expression}} +constexpr struct angle {AngleIsInvalidNow e;} angle; + +// expected-error@+1 {{non-type template argument is not a constant expression}} +template auto R, typename Rep> requires requires(Rep v) {cos(v);} +auto cos(const Rep& q); + +// expected-error@+1 {{non-type template argument is not a constant expression}} +template auto R, typename Rep> requires requires(Rep v) {tan(v);} +auto tan(const Rep& q); + +//--- main.cpp +// expected-no-diagnostics +import mod; + +// 
CHECK: |-FunctionTemplateDecl {{.*}} col:6 imported in mod hidden invalid cos +// CHECK-NEXT: | |-NonTypeTemplateParmDecl {{.*}} col:34 imported in mod hidden referenced invalid 'ReferenceOf auto' depth 0 index 0 R +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} col:46 imported in mod hidden referenced typename depth 0 index 1 Rep +// CHECK-NEXT: | |-RequiresExpr {{.*}} 'bool' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} col:73 imported in mod hidden referenced v 'Rep' +// CHECK-NEXT: | | `-SimpleRequirement {{.*}} dependent +// CHECK-NEXT: | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | |-UnresolvedLookupExpr {{.*}} '' lvalue (ADL) = 'cos' empty +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'Rep' lvalue ParmVar {{.*}} 'v' 'Rep' non_odr_use_unevaluated +// CHECK-NEXT: | `-FunctionDecl {{.*}} col:6 imported in mod hidden cos 'auto (const Rep &)' +// CHECK-NEXT: | `-ParmVarDecl {{.*}} col:21 imported in mod hidden q 'const Rep &' + +// CHECK: |-FunctionTemplateDecl {{.*}} col:6 imported in mod hidden invalid tan +// CHECK-NEXT: | |-NonTypeTemplateParmDecl {{.*}} col:34 imported in mod hidden referenced invalid 'ReferenceOf auto' depth 0 index 0 R +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} col:46 imported in mod hidden referenced typename depth 0 index 1 Rep +// CHECK-NEXT: | |-RequiresExpr {{.*}} 'bool' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} col:73 imported in mod hidden referenced v 'Rep' +// CHECK-NEXT: | | `-SimpleRequirement {{.*}} dependent +// CHECK-NEXT: | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | |-UnresolvedLookupExpr {{.*}} '' lvalue (ADL) = 'tan' empty +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'Rep' lvalue ParmVar {{.*}} 'v' 'Rep' non_odr_use_unevaluated +// CHECK-NEXT: | `-FunctionDecl {{.*}} col:6 imported in mod hidden tan 'auto (const Rep &)' +// CHECK-NEXT: | `-ParmVarDecl {{.*}} col:21 imported in mod hidden q 'const Rep &' diff --git a/clang/test/Preprocessor/riscv-cf-protection-return.c b/clang/test/Preprocessor/riscv-cf-protection-return.c index 3a93a88fa6839..a4cbaa1edf68c 100644 --- a/clang/test/Preprocessor/riscv-cf-protection-return.c +++ b/clang/test/Preprocessor/riscv-cf-protection-return.c @@ -40,5 +40,7 @@ // RUN: -menable-experimental-extensions -fcf-protection=full -E -dM %s -o - \ // RUN: | FileCheck --check-prefixes=SHSTK-MACRO %s +// SHSTK-MACRO-NOT: __CET__ // SHSTK-MACRO: __riscv_shadow_stack 1{{$}} +// SHSTK-MACRO-NOT: __CET__ // NO-MACRO-NOT: __riscv_shadow_stack diff --git a/clang/test/SemaCXX/cxx2c-binding-pack.cpp b/clang/test/SemaCXX/cxx2c-binding-pack.cpp index 62e1da565f2b5..0f10dad3937ba 100644 --- a/clang/test/SemaCXX/cxx2c-binding-pack.cpp +++ b/clang/test/SemaCXX/cxx2c-binding-pack.cpp @@ -218,3 +218,18 @@ auto X = [] () { static_assert(sizeof...(pack3) == 5); }; } // namespace + +namespace GH125165 { + +template +auto f(auto t) { + const auto& [...pack] = t; + // expected-error@-1 {{cannot decompose non-class, non-array type 'char const'}} + (pack, ...); +}; + +void g() { + f('x'); // expected-note {{in instantiation}} +} + +} diff --git a/clang/test/SemaHLSL/BuiltIns/and-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/and-errors.hlsl new file mode 100644 index 0000000000000..0a99feb023d73 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/and-errors.hlsl @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -O1 -verify + +bool test_too_few_arg(bool a) { + return __builtin_hlsl_and(a); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} +} + 
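For contrast with the diagnostic cases in this new test, a well-formed call would presumably look like the sketch below (the function name is illustrative and not part of the patch; the builtin's two-argument, same-type contract is inferred from the diagnostics this test expects): bool2 test_valid(bool2 a, bool2 b) { // Both operands share one bool vector type, so no diagnostic is expected; // the builtin yields the elementwise logical AND of its operands. return __builtin_hlsl_and(a, b); }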
+bool test_too_many_arg(bool a) { + return __builtin_hlsl_and(a, a, a); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +bool2 test_mismatched_args(bool2 a, bool3 b) { + return __builtin_hlsl_and(a, b); + // expected-error@-1 {{all arguments to '__builtin_hlsl_and' must have the same type}} +} + +bool test_incorrect_type(int a) { + return __builtin_hlsl_and(a, a); + // expected-error@-1{{invalid operand of type 'int' where 'bool' or a vector of such type is required}} +} diff --git a/clang/test/SemaHLSL/Language/AssignArray.hlsl b/clang/test/SemaHLSL/Language/AssignArray.hlsl new file mode 100644 index 0000000000000..1f813e7a350b1 --- /dev/null +++ b/clang/test/SemaHLSL/Language/AssignArray.hlsl @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -ast-dump | FileCheck %s + +typedef vector int8[2]; + +export void fn(int8 A) { + int8 a = {A}; +// CHECK-LABEL: VarDecl {{.*}} b 'int8':'vector[2]' cinit +// CHECK-NEXT: ArrayInitLoopExpr {{.*}} 'int8':'vector[2]' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'vector' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' + int8 b = a; + +// CHECK-LABEL: VarDecl {{.*}} c 'int8':'vector[2]' cinit +// CHECK-NEXT: ArrayInitLoopExpr {{.*}} 'int8':'vector[2]' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'vector' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' + int8 c = A; +} + + + + diff --git a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl index 9417249383469..30591507b3260 100644 --- a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl +++ b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl @@ -4,7 +4,7 @@ export void cantCast() { int A[3] = {1,2,3}; int B[4] = {1,2,3,4}; B = (int[4])A; - // expected-error@-1 {{C-style cast from 'int *' to 'int[4]' is not allowed}} + // expected-error@-1 {{C-style cast from 'int[3]' to 'int[4]' is not allowed}} } struct S { diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl similarity index 99% rename from clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl rename to clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl index 0fc2304d51ce0..db0387e9878f2 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify -S -o - %s // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -verify -S -o - %s typedef float v2f 
__attribute__((ext_vector_type(2))); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl index d1c134c604dfc..b40b1c841b453 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify -S -o - %s typedef float float4 __attribute__((ext_vector_type(4))); typedef float float16 __attribute__((ext_vector_type(16))); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl similarity index 81% rename from clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl rename to clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl index 7cf80f7c92677..0b3f692f33998 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify=gfx940,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx942,expected -o - %s // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s // REQUIRES: amdgpu-registered-target @@ -8,12 +8,12 @@ void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, 
/*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx942-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx942-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} } __attribute__((target("gfx950-insts"))) diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index a9d8261bd03e7..0e7de8b98ea07 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -1213,28 +1213,36 @@ static void PrintCursor(CXCursor Cursor, const char *CommentSchemaFile) { } } -static const char* GetCursorSource(CXCursor Cursor) { +static CXString createCXString(const char *CS) { + CXString Str; + Str.data = (const void *)CS; + Str.private_flags = 0; + return Str; +} + +static CXString duplicateCXString(const char *CS) { + CXString Str; + Str.data = strdup(CS); + Str.private_flags = 1; // CXS_Malloc + return Str; +} + +static CXString GetCursorSource(CXCursor Cursor) { CXSourceLocation Loc = clang_getCursorLocation(Cursor); CXString source; CXFile file; + const char *b; + CXString result; clang_getExpansionLocation(Loc, &file, 0, 0, 0); source = clang_getFileName(file); if (!clang_getCString(source)) { clang_disposeString(source); - return ""; - } - else { - const char *b = basename(clang_getCString(source)); - clang_disposeString(source); - return b; + return createCXString(""); } -} - -static CXString createCXString(const char *CS) { - CXString Str; - Str.data = (const void *) CS; - Str.private_flags = 0; - return Str; + b = basename(clang_getCString(source)); + result = duplicateCXString(b); + clang_disposeString(source); + return result; } /******************************************************************************/ @@ -1357,9 +1365,12 @@ enum CXChildVisitResult FilteredPrintingVisitor(CXCursor Cursor, if (!Data->Filter || (Cursor.kind == *(enum CXCursorKind *)Data->Filter)) { CXSourceLocation Loc = clang_getCursorLocation(Cursor); unsigned line, column; + CXString source; clang_getFileLocation(Loc, 0, &line, &column, 0); - printf("// %s: %s:%d:%d: ", FileCheckPrefix, - GetCursorSource(Cursor), line, column); + source = GetCursorSource(Cursor); + printf("// %s: %s:%d:%d: ", FileCheckPrefix, clang_getCString(source), line, + column); + clang_disposeString(source); PrintCursor(Cursor, Data->CommentSchemaFile); PrintCursorExtent(Cursor); if (clang_isDeclaration(Cursor.kind)) { @@ -1428,8 +1439,10 @@ static enum CXChildVisitResult FunctionScanVisitor(CXCursor Cursor, if (Ref.kind == CXCursor_NoDeclFound) { /* Nothing found here; that's fine. 
*/ } else if (Ref.kind != CXCursor_FunctionDecl) { - printf("// %s: %s:%d:%d: ", FileCheckPrefix, GetCursorSource(Ref), - curLine, curColumn); + CXString CursorSource = GetCursorSource(Ref); + printf("// %s: %s:%d:%d: ", FileCheckPrefix, + clang_getCString(CursorSource), curLine, curColumn); + clang_disposeString(CursorSource); PrintCursor(Ref, Data->CommentSchemaFile); printf("\n"); } @@ -1451,11 +1464,15 @@ enum CXChildVisitResult USRVisitor(CXCursor C, CXCursor parent, if (!Data->Filter || (C.kind == *(enum CXCursorKind *)Data->Filter)) { CXString USR = clang_getCursorUSR(C); const char *cstr = clang_getCString(USR); + CXString CursorSource; if (!cstr || cstr[0] == '\0') { clang_disposeString(USR); return CXChildVisit_Recurse; } - printf("// %s: %s %s", FileCheckPrefix, GetCursorSource(C), cstr); + CursorSource = GetCursorSource(C); + printf("// %s: %s %s", FileCheckPrefix, clang_getCString(CursorSource), + cstr); + clang_disposeString(CursorSource); PrintCursorExtent(C); printf("\n"); diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 4aed086563ee9..0c1cdd9a1fb60 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -6,6 +6,12 @@ set(CLANG_PGO_TRAINING_DATA "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH set(CLANG_PGO_TRAINING_DATA_SOURCE_DIR OFF CACHE STRING "Path to source directory containing cmake project with source files to use for generating pgo data") set(CLANG_PGO_TRAINING_DEPS "" CACHE STRING "Extra dependencies needed to build the PGO training data.") +option(CLANG_PGO_TRAINING_USE_LLVM_BUILD "Use LLVM build for generating PGO data" ON) + +llvm_canonicalize_cmake_booleans( + CLANG_PGO_TRAINING_USE_LLVM_BUILD +) + if(LLVM_BUILD_INSTRUMENTED) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in diff --git a/clang/utils/perf-training/lit.cfg b/clang/utils/perf-training/lit.cfg index adefc7893ac44..3f6089b7139a7 100644 --- a/clang/utils/perf-training/lit.cfg +++ b/clang/utils/perf-training/lit.cfg @@ -27,6 +27,9 @@ config.clang = lit.util.which('clang', config.clang_tools_dir).replace('\\', '/' config.name = 'Clang Perf Training' config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test'] +if not config.use_llvm_build: + config.excludes = ['llvm-support'] + cc1_wrapper = '%s %s/perf-helper.py cc1' % (config.python_exe, config.perf_helper_dir) use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") diff --git a/clang/utils/perf-training/lit.site.cfg.in b/clang/utils/perf-training/lit.site.cfg.in index 9d279d552919a..da81ec21a28a6 100644 --- a/clang/utils/perf-training/lit.site.cfg.in +++ b/clang/utils/perf-training/lit.site.cfg.in @@ -11,6 +11,7 @@ config.python_exe = "@Python3_EXECUTABLE@" config.cmake_exe = "@CMAKE_COMMAND@" config.llvm_src_dir ="@CMAKE_SOURCE_DIR@" config.cmake_generator ="@CMAKE_GENERATOR@" +config.use_llvm_build = @CLANG_PGO_TRAINING_USE_LLVM_BUILD@ # Let the main config do the real work. lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/lit.cfg") diff --git a/flang-rt/lib/runtime/io-api-minimal.cpp b/flang-rt/lib/runtime/io-api-minimal.cpp index 8d8c9c6070b04..fdf7183ed5176 100644 --- a/flang-rt/lib/runtime/io-api-minimal.cpp +++ b/flang-rt/lib/runtime/io-api-minimal.cpp @@ -150,7 +150,8 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) { // Provide own definition for `std::__libcpp_verbose_abort` to avoid dependency // on the version provided by libc++.
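The hunk below makes the flang-rt definition inherit its exception specification from whatever declaration of `std::__libcpp_verbose_abort` is in scope, so it stays compatible whether or not the libc++ headers declare the function `noexcept`. A minimal self-contained sketch of the trick, with a hypothetical function name:

#include <cstdio>

// Library-style declaration; it may or may not carry noexcept.
void verbose_abort(const char *fmt, ...);

// noexcept(noexcept(verbose_abort(""))) evaluates the noexcept-ness of a call
// through the declaration above, so the definition's exception specification
// always matches the declaration's, whichever form it takes.
void verbose_abort(const char *fmt, ...) noexcept(noexcept(verbose_abort(""))) {
  std::fprintf(stderr, "%s\n", fmt);
}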
-void std::__libcpp_verbose_abort(char const *format, ...) { +void std::__libcpp_verbose_abort(char const *format, ...) noexcept( + noexcept(std::__libcpp_verbose_abort(""))) { va_list list; va_start(list, format); std::vfprintf(stderr, format, list); diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 8e4f47d18535d..eb0e964559ed5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,10 +98,10 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030" + "gfx908;gfx90a;gfx90c;gfx942;gfx950;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" - "gfx1152;gfx1153" + "gfx1152;gfx1153;gfx1200;gfx1201" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index c6c2e29a420ea..5e76d4331f6de 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -45,6 +45,11 @@ A list of non-standard directives supported by Flang times if possible. When `n` is omitted, the compiler should attempt to fully unroll the loop. Some compilers accept an optional `=` before the `n` when `n` is present in the directive. Flang does not. +* `!dir$ unroll_and_jam [N]` controls how many times a loop should be unrolled and + jammed. It must be placed immediately before the loop that follows. `N` is an optional + integer specifying the unrolling factor. When `N` is `0` or `1`, the loop + should not be unrolled at all. If `N` is omitted, the optimizer will + select the number of times to unroll the loop. # Directive Details diff --git a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h index 8d17e4e476d10..c71988d081dd0 100644 --- a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h +++ b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h @@ -211,14 +211,14 @@ struct AliasAnalysis { fir::AliasAnalysis::Source getSource(mlir::Value, bool getLastInstantiationPoint = false); + /// Return true, if `ty` is a reference type to a boxed + /// POINTER object or a raw fir::PointerType. + static bool isPointerReference(mlir::Type ty); + private: /// Return true, if `ty` is a reference type to an object of derived type /// that contains a component with POINTER attribute. static bool isRecordWithPointerComponent(mlir::Type ty); - - /// Return true, if `ty` is a reference type to a boxed - /// POINTER object or a raw fir::PointerType. 
- static bool isPointerReference(mlir::Type ty); }; inline bool operator==(const AliasAnalysis::Source::SourceOrigin &lhs, diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 21ee1d0517840..75c11301285b3 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -209,6 +209,7 @@ class ParseTreeDumper { NODE(CompilerDirective, Unrecognized) NODE(CompilerDirective, VectorAlways) NODE(CompilerDirective, Unroll) + NODE(CompilerDirective, UnrollAndJam) NODE(parser, ComplexLiteralConstant) NODE(parser, ComplexPart) NODE(parser, ComponentArraySpec) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index c3a02fca5ade8..c2fa9a2228180 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3349,6 +3349,8 @@ struct StmtFunctionStmt { // !DIR$ IGNORE_TKR [ [(tkrdmac...)] name ]... // !DIR$ LOOP COUNT (n1[, n2]...) // !DIR$ name[=value] [, name[=value]]... = can be : +// !DIR$ UNROLL [N] +// !DIR$ UNROLL_AND_JAM [N] // !DIR$ struct CompilerDirective { UNION_CLASS_BOILERPLATE(CompilerDirective); @@ -3371,10 +3373,13 @@ struct CompilerDirective { struct Unroll { WRAPPER_CLASS_BOILERPLATE(Unroll, std::optional); }; + struct UnrollAndJam { + WRAPPER_CLASS_BOILERPLATE(UnrollAndJam, std::optional); + }; EMPTY_CLASS(Unrecognized); CharBlock source; std::variant, LoopCount, std::list, - VectorAlways, std::list, Unroll, Unrecognized> + VectorAlways, std::list, Unroll, UnrollAndJam, Unrecognized> u; }; @@ -4553,8 +4558,8 @@ WRAPPER_CLASS(OmpReductionInitializerClause, Expr); struct OpenMPDeclareReductionConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPDeclareReductionConstruct); CharBlock source; - std::tuple, - OmpReductionCombiner, std::optional> + std::tuple, + std::optional> t; }; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 7c217ce2f404c..1b24ed12e04f1 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2205,11 +2205,39 @@ class FirConverter : public Fortran::lower::AbstractConverter { /*full=*/fullUnrollAttr, {}, {}, {}); } + // Generate the LLVM unroll-and-jam attribute for a !dir$ unroll_and_jam + // directive. Without a value, unroll and jam is simply enabled. With a + // value greater than 1, it is forced with that factor; with a value of + // 0 or 1, unrolling and jamming are disabled. + mlir::LLVM::LoopUnrollAndJamAttr + genLoopUnrollAndJamAttr(std::optional count) { + mlir::BoolAttr falseAttr = + mlir::BoolAttr::get(builder->getContext(), false); + mlir::BoolAttr trueAttr = mlir::BoolAttr::get(builder->getContext(), true); + mlir::IntegerAttr countAttr; + bool shouldUnroll = true; + if (count.has_value()) { + auto unrollingFactor = count.value(); + if (unrollingFactor == 0 || unrollingFactor == 1) { + shouldUnroll = false; + } else { + countAttr = + builder->getIntegerAttr(builder->getI64Type(), unrollingFactor); + } + } + + mlir::BoolAttr disableAttr = shouldUnroll ? 
falseAttr : trueAttr; + return mlir::LLVM::LoopUnrollAndJamAttr::get( + builder->getContext(), /*disable=*/disableAttr, /*count*/ countAttr, {}, + {}, {}, {}, {}); + } + void addLoopAnnotationAttr( IncrementLoopInfo &info, llvm::SmallVectorImpl &dirs) { mlir::LLVM::LoopVectorizeAttr va; mlir::LLVM::LoopUnrollAttr ua; + mlir::LLVM::LoopUnrollAndJamAttr uja; bool has_attrs = false; for (const auto *dir : dirs) { Fortran::common::visit( @@ -2226,12 +2254,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { ua = genLoopUnrollAttr(u.v); has_attrs = true; }, + [&](const Fortran::parser::CompilerDirective::UnrollAndJam &u) { + uja = genLoopUnrollAndJamAttr(u.v); + has_attrs = true; + }, [&](const auto &) {}}, dir->u); } mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get( - builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}); + builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, + /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}); if (has_attrs) info.doLoop.setLoopAnnotationAttr(la); } @@ -2887,6 +2919,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { [&](const Fortran::parser::CompilerDirective::Unroll &) { attachDirectiveToLoop(dir, &eval); }, + [&](const Fortran::parser::CompilerDirective::UnrollAndJam &) { + attachDirectiveToLoop(dir, &eval); + }, [&](const auto &) {}}, dir.u); } diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index 5a9a839330364..ed2700c42fc55 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -478,9 +478,47 @@ class IsEqualEvaluateExpr { return isEqual(x.proc(), y.proc()) && isEqual(x.arguments(), y.arguments()); } template + static bool isEqual(const Fortran::evaluate::ImpliedDo &x, + const Fortran::evaluate::ImpliedDo &y) { + return isEqual(x.values(), y.values()) && isEqual(x.lower(), y.lower()) && + isEqual(x.upper(), y.upper()) && isEqual(x.stride(), y.stride()); + } + template + static bool isEqual(const Fortran::evaluate::ArrayConstructorValues &x, + const Fortran::evaluate::ArrayConstructorValues &y) { + using Expr = Fortran::evaluate::Expr; + using ImpliedDo = Fortran::evaluate::ImpliedDo; + for (const auto &[xValue, yValue] : llvm::zip(x, y)) { + bool checkElement = Fortran::common::visit( + common::visitors{ + [&](const Expr &v, const Expr &w) { return isEqual(v, w); }, + [&](const ImpliedDo &v, const ImpliedDo &w) { + return isEqual(v, w); + }, + [&](const Expr &, const ImpliedDo &) { return false; }, + [&](const ImpliedDo &, const Expr &) { return false; }, + }, + xValue.u, yValue.u); + if (!checkElement) { + return false; + } + } + return true; + } + static bool isEqual(const Fortran::evaluate::SubscriptInteger &x, + const Fortran::evaluate::SubscriptInteger &y) { + return x == y; + } + template static bool isEqual(const Fortran::evaluate::ArrayConstructor &x, const Fortran::evaluate::ArrayConstructor &y) { - llvm::report_fatal_error("not implemented"); + bool checkCharacterType = true; + if constexpr (A::category == Fortran::common::TypeCategory::Character) { + checkCharacterType = isEqual(*x.LEN(), *y.LEN()); + } + using Base = Fortran::evaluate::ArrayConstructorValues; + return isEqual((Base)x, (Base)y) && + (x.GetType() == y.GetType() && checkCharacterType); } static bool isEqual(const Fortran::evaluate::ImpliedDoIndex &x, const Fortran::evaluate::ImpliedDoIndex &y) { diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp 
b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 873758487ddd0..70fa18ad65b9b 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -54,10 +54,11 @@ static bool hasGlobalOpTargetAttr(mlir::Value v, fir::AddrOfOp op) { static mlir::Value getOriginalDef(mlir::Value v, fir::AliasAnalysis::Source::Attributes &attributes, - bool &isCapturedInInternalProcedure) { + bool &isCapturedInInternalProcedure, bool &approximateSource) { mlir::Operation *defOp; bool breakFromLoop = false; while (!breakFromLoop && (defOp = v.getDefiningOp())) { + mlir::Type ty = defOp->getResultTypes()[0]; llvm::TypeSwitch(defOp) .Case([&](fir::ConvertOp op) { v = op.getValue(); }) .Case([&](auto op) { @@ -67,6 +68,18 @@ getOriginalDef(mlir::Value v, isCapturedInInternalProcedure |= varIf.isCapturedInInternalProcedure(); }) + .Case([&](auto op) { + if (fir::AliasAnalysis::isPointerReference(ty)) + attributes.set(fir::AliasAnalysis::Attribute::Pointer); + v = op->getOperand(0); + approximateSource = true; + }) + .Case([&](hlfir::DesignateOp op) { + auto varIf = llvm::cast(defOp); + attributes |= getAttrsFromVariable(varIf); + v = op.getMemref(); + approximateSource = true; + }) .Default([&](auto op) { breakFromLoop = true; }); } return v; @@ -609,7 +622,8 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, attributes.set(Attribute::Pointer); auto def = getOriginalDef(op.getMemref(), attributes, - isCapturedInInternalProcedure); + isCapturedInInternalProcedure, + approximateSource); if (auto addrOfOp = def.template getDefiningOp()) { global = addrOfOp.getSymbol(); diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp index b5bcb53a12761..cfe9ecb29b0b7 100644 --- a/flang/lib/Parser/Fortran-parsers.cpp +++ b/flang/lib/Parser/Fortran-parsers.cpp @@ -1308,11 +1308,14 @@ constexpr auto vectorAlways{ "VECTOR ALWAYS" >> construct()}; constexpr auto unroll{ "UNROLL" >> construct(maybe(digitString64))}; +constexpr auto unrollAndJam{"UNROLL_AND_JAM" >> + construct(maybe(digitString64))}; TYPE_PARSER(beginDirective >> "DIR$ "_tok >> sourced((construct(ignore_tkr) || construct(loopCount) || construct(assumeAligned) || construct(vectorAlways) || + construct(unrollAndJam) || construct(unroll) || construct( many(construct( diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 2b6c77c08cc58..b39b8737b70c0 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -170,8 +170,8 @@ TYPE_PARSER(sourced( // TYPE_PARSER(construct(nonemptyList(Parser{}))) TYPE_PARSER( // - construct(Parser{}) || - construct(Parser{})) + construct(Parser{}) || + construct(Parser{})) TYPE_PARSER(construct( // Parser{}, @@ -1148,9 +1148,7 @@ TYPE_PARSER(construct( // 2.16 Declare Reduction Construct TYPE_PARSER(sourced(construct( verbatim("DECLARE REDUCTION"_tok), - "(" >> Parser{} / ":", - nonemptyList(Parser{}) / ":", - Parser{} / ")", + "(" >> indirect(Parser{}) / ")", maybe(Parser{})))) // declare-target with list diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index cd91fbe4ea5eb..6260a01897527 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1851,6 +1851,10 @@ class UnparseVisitor { Word("!DIR$ UNROLL"); Walk(" ", unroll.v); }, + [&](const CompilerDirective::UnrollAndJam &unrollAndJam) { + Word("!DIR$ UNROLL_AND_JAM"); + Walk(" ", unrollAndJam.v); + }, [&](const CompilerDirective::Unrecognized &) { Word("!DIR$ "); 
Word(x.source.ToString()); @@ -2690,11 +2694,10 @@ class UnparseVisitor { BeginOpenMP(); Word("!$OMP DECLARE REDUCTION "); Put("("); - Walk(std::get(x.t)), Put(" : "); - Walk(std::get>(x.t), ","), Put(" : "); - Walk(std::get(x.t)); + Walk(std::get>(x.t)); Put(")"); Walk(std::get>(x.t)); + Put("\n"); EndOpenMP(); } diff --git a/flang/lib/Semantics/canonicalize-directives.cpp b/flang/lib/Semantics/canonicalize-directives.cpp index b27a27618808b..1a0a0d145b3e2 100644 --- a/flang/lib/Semantics/canonicalize-directives.cpp +++ b/flang/lib/Semantics/canonicalize-directives.cpp @@ -56,7 +56,8 @@ bool CanonicalizeDirectives( static bool IsExecutionDirective(const parser::CompilerDirective &dir) { return std::holds_alternative( dir.u) || - std::holds_alternative(dir.u); + std::holds_alternative(dir.u) || + std::holds_alternative(dir.u); } void CanonicalizationOfDirectives::Post(parser::SpecificationPart &spec) { @@ -115,6 +116,9 @@ void CanonicalizationOfDirectives::Post(parser::Block &block) { [&](parser::CompilerDirective::Unroll &) { CheckLoopDirective(*dir, block, it); }, + [&](parser::CompilerDirective::UnrollAndJam &) { + CheckLoopDirective(*dir, block, it); + }, [&](auto &) {}}, dir->u); } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 1d6fe6c8d4249..49e507feab580 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -3178,6 +3178,10 @@ bool OmpStructureChecker::CheckReductionOperator( const SourceName &realName{name->symbol->GetUltimate().name()}; valid = llvm::is_contained({"max", "min", "iand", "ior", "ieor"}, realName); + if (!valid) { + auto *misc{name->symbol->detailsIf()}; + valid = misc && misc->kind() == MiscDetails::Kind::ConstructName; + } } if (!valid) { context_.Say(source, diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 91a1b3061e1f9..38888a4dc1461 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -446,6 +446,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPDeclareMapperConstruct &); void Post(const parser::OpenMPDeclareMapperConstruct &) { PopContext(); } + bool Pre(const parser::OpenMPDeclareReductionConstruct &); + void Post(const parser::OpenMPDeclareReductionConstruct &) { PopContext(); } + bool Pre(const parser::OpenMPThreadprivate &); void Post(const parser::OpenMPThreadprivate &) { PopContext(); } @@ -1976,6 +1979,12 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclareMapperConstruct &x) { return true; } +bool OmpAttributeVisitor::Pre( + const parser::OpenMPDeclareReductionConstruct &x) { + PushContext(x.source, llvm::omp::Directive::OMPD_declare_reduction); + return true; +} + bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { PushContext(x.source, llvm::omp::Directive::OMPD_threadprivate); const auto &list{std::get(x.t)}; @@ -2309,7 +2318,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) { } if (Symbol * found{currScope().FindSymbol(name.source)}) { - if (found->test(semantics::Symbol::Flag::OmpThreadprivate)) + if (found->GetUltimate().test(semantics::Symbol::Flag::OmpThreadprivate)) return; } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e64abe6b50e78..17a6665dfb6a5 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1482,6 +1482,15 @@ class OmpVisitor : public 
virtual DeclarationVisitor { return false; } + bool Pre(const parser::OpenMPDeclareReductionConstruct &x) { + AddOmpSourceRange(x.source); + parser::OmpClauseList emptyList{std::list{}}; + ProcessReductionSpecifier( + std::get>(x.t).value(), + emptyList); + Walk(std::get>(x.t)); + return false; + } bool Pre(const parser::OmpMapClause &); void Post(const parser::OmpBeginLoopDirective &) { @@ -1732,11 +1741,19 @@ void OmpVisitor::ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec, void OmpVisitor::ProcessReductionSpecifier( const parser::OmpReductionSpecifier &spec, const parser::OmpClauseList &clauses) { + BeginDeclTypeSpec(); + const auto &id{std::get(spec.t)}; + if (auto procDes{std::get_if(&id.u)}) { + if (auto *name{std::get_if(&procDes->u)}) { + name->symbol = + &MakeSymbol(*name, MiscDetails{MiscDetails::Kind::ConstructName}); + } + } + EndDeclTypeSpec(); // Creating a new scope in case the combiner expression (or clauses) use // reserved identifiers, like "omp_in". This is a temporary solution until // we deal with these in a more thorough way. PushScope(Scope::Kind::OtherConstruct, nullptr); - Walk(std::get(spec.t)); Walk(std::get(spec.t)); Walk(std::get>(spec.t)); Walk(clauses); @@ -9535,7 +9552,8 @@ void ResolveNamesVisitor::Post(const parser::AssignedGotoStmt &x) { void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) { if (std::holds_alternative(x.u) || - std::holds_alternative(x.u)) { + std::holds_alternative(x.u) || + std::holds_alternative(x.u)) { return; } if (const auto *tkr{ diff --git a/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir b/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir new file mode 100644 index 0000000000000..de81841d9249d --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir @@ -0,0 +1,511 @@ +// Check aliasing with the address *in* (not *of*) a pointer component +// (hlfir.designate). +// +// Throughout this test, the ".fir" suffix on symbols indicates a version of the +// MLIR after convert-hlfir-to-fir. A key difference is that component access +// is via fir.coordinate_of instead of hlfir.designate. We would like alias +// analysis results to be the same in both versions. + +// RUN: fir-opt %s -split-input-file -o /dev/null --mlir-disable-threading \ +// RUN: -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' \ +// RUN: 2>&1 | FileCheck -match-full-lines %s + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// end module m +// subroutine test() +// use m +// real, target :: t +// real :: v +// type(ty) :: obj +// type(ty), target :: t_obj +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer.
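+// As a hedged illustration (hypothetical Fortran, not generated by this
+// test): pointer assignment only changes the target address stored *in* the
+// descriptor, e.g.
+//
+//   obj%p0 => t   ! obj%p0.tgt is now the address of t
+//   obj%p1 => t   ! obj%p0.tgt and obj%p1.tgt may now compare equal
+//
+// and standard Fortran provides no way to associate a data pointer with a
+// descriptor's own storage, so each descriptor address below is NoAlias with
+// any pointer's loaded target address.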
+// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: NoAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: NoAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. 
+// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest() { + %0 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "obj", uniq_name = "_QFtestEobj"} + %1:2 = hlfir.declare %0 {test.ptr="obj", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "t_obj", fir.target, uniq_name = "_QFtestEt_obj"} + %5:2 = hlfir.declare %4 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7:2 = hlfir.declare %6 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %8 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %11 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %12 = fir.load %11 : !fir.ref>> + %13 = fir.box_addr %12 {test.ptr="obj%p1.tgt"}: (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %14 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %15 = hlfir.designate %1#0{"arr"} <%14> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %16 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %17 = fir.load %16 : !fir.ref>> + %repeat17 = fir.load %16 : !fir.ref>> + %18 = fir.box_addr %17 {test.ptr="obj%alloc.tgt"}: (!fir.box>) -> !fir.heap + %repeat18 = fir.box_addr %repeat17 {test.ptr="obj%alloc.tgt2"}: (!fir.box>) -> !fir.heap + %c2_1 = arith.constant 2 : index + %19 = fir.shape %c2_1 : (index) -> !fir.shape<1> + %c1_2 = arith.constant 1 : index + %20 = hlfir.designate %5#0{"arr"} <%19> (%c1_2) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %21 = hlfir.designate %5#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir() { + %0 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "obj", uniq_name = "_QFtestEobj"} + %1 = fir.declare %0 {test.ptr="obj.fir", uniq_name = "_QFtestEobj"} : 
(!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "t_obj", fir.target, uniq_name = "_QFtestEt_obj"} + %5 = fir.declare %4 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7 = fir.declare %6 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %8 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %9 = fir.coordinate_of %1, %8 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %10 = fir.load %9 : !fir.ref>> + %11 = fir.box_addr %10 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %12 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %13 = fir.coordinate_of %1, %12 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %14 = fir.load %13 : !fir.ref>> + %15 = fir.box_addr %14 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %18 = fir.coordinate_of %1, %17 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %19 = fir.array_coor %18(%16) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %20 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %21 = fir.coordinate_of %1, %20 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %repeat22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat23 = fir.box_addr %repeat22 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %24 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %25 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %26 = fir.coordinate_of %5, %25 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %27 = fir.array_coor %26(%24) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %28 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %29 = fir.coordinate_of %5, %28 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %30 = fir.load %29 : !fir.ref>> + %31 = fir.box_addr %30 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} + +// ----- + +// 
Repeat above test except composites are dummy args instead of locals. + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// end module m +// subroutine test(obj, t_obj) +// use m +// type(ty) :: obj +// type(ty), target :: t_obj +// real, target :: t +// real :: v +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer. +// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// TODO: Thus, we expect all cases below to be NoAlias. However, target dummy +// args are currently indiscriminately analyzed as MayAlias.
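+// A hedged sketch of that conservatism (hypothetical caller, not part of
+// this test):
+//
+//   type(ty) :: obj
+//   type(ty), target :: x
+//   real, pointer :: p
+//   call test(obj, x)   ! t_obj becomes argument-associated with x
+//
+// Because x has the TARGET attribute, pointers like p may be associated with
+// parts of x around the call, so the analysis currently reports MayAlias for
+// t_obj pairs even where NoAlias would be provable, as the t_obj cases below
+// show.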
+// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. +// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest(%arg0: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "obj"}, %arg1: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "t_obj", fir.target}) { + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {test.ptr="obj", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4:2 = hlfir.declare %arg1 dummy_scope %0 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %5 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %6:2 = hlfir.declare %5 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %7 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %8 = fir.load %7 : !fir.ref>> + %9 = fir.box_addr %8 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %10 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %11 = fir.load %10 : !fir.ref>> + %12 = fir.box_addr %11 {test.ptr="obj%p1.tgt"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %13 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %14 = hlfir.designate %1#0{"arr"} <%13> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %15 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %16 = fir.load %15 : 
!fir.ref>> + %repeat16 = fir.load %15 : !fir.ref>> + %17 = fir.box_addr %16 {test.ptr="obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + %repeat17 = fir.box_addr %repeat16 {test.ptr="obj%alloc.tgt2"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %18 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %19 = hlfir.designate %4#0{"arr"} <%18> (%c1_1) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %20 = hlfir.designate %4#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %21 = fir.load %20 : !fir.ref>> + %22 = fir.box_addr %21 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir(%arg0: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "obj"}, %arg1: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "t_obj", fir.target}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.declare %arg0 dummy_scope %0 {test.ptr="obj.fir", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.declare %arg1 dummy_scope %0 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %6 = fir.declare %5 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %7 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %8 = fir.coordinate_of %1, %7 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %11 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %12 = fir.coordinate_of %1, %11 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %13 = fir.load %12 : !fir.ref>> + %14 = fir.box_addr %13 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %15 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %16 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %17 = fir.coordinate_of %1, %16 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %18 = fir.array_coor %17(%15) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %19 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %20 = fir.coordinate_of %1, %19 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %21 = fir.load %20 : !fir.ref>> + %repeat21 = fir.load %20 : !fir.ref>> + %22 = fir.box_addr 
%21 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat22 = fir.box_addr %repeat21 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %23 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %24 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %25 = fir.coordinate_of %4, %24 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %26 = fir.array_coor %25(%23) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %27 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %28 = fir.coordinate_of %4, %27 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %29 = fir.load %28 : !fir.ref>> + %30 = fir.box_addr %29 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} + +// ----- + +// Repeat above test except composites are globals. + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// type(ty) :: obj +// type(ty), target :: t_obj +// end module m +// subroutine test() +// use m +// real, target :: t +// real :: v +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer. +// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. 
+// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: NoAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: NoAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. +// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest() { + %0 = fir.address_of(@_QMmEobj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %1:2 = hlfir.declare %0 {test.ptr="obj", uniq_name = "_QMmEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4 = fir.address_of(@_QMmEt_obj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5:2 = hlfir.declare %4 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QMmEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7:2 = hlfir.declare %6 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %8 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %12 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %13 = fir.load %12 : !fir.ref>> + %14 = fir.box_addr %13 {test.ptr="obj%p1.tgt"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + 
%16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = hlfir.designate %1#0{"arr"} <%16> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %19 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %20 = fir.load %19 : !fir.ref>> + %repeat20 = fir.load %19 : !fir.ref>> + %21 = fir.box_addr %20 {test.ptr="obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + %repeat21 = fir.box_addr %repeat20 {test.ptr="obj%alloc.tgt2"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %23 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %24 = hlfir.designate %5#0{"arr"} <%23> (%c1_1) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %26 = hlfir.designate %5#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %27 = fir.load %26 : !fir.ref>> + %28 = fir.box_addr %27 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir() { + %0 = fir.address_of(@_QMmEobj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %1 = fir.declare %0 {test.ptr="obj.fir", uniq_name = "_QMmEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.address_of(@_QMmEt_obj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5 = fir.declare %4 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QMmEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7 = fir.declare %6 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %8 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %9 = fir.coordinate_of %1, %8 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %10 = fir.load %9 : !fir.ref>> + %11 = fir.box_addr %10 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %12 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %13 = fir.coordinate_of %1, %12 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %14 = fir.load %13 : !fir.ref>> + %15 = fir.box_addr %14 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %18 = fir.coordinate_of %1, %17 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %19 = fir.array_coor %18(%16) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %20 = fir.field_index 
alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %21 = fir.coordinate_of %1, %20 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %repeat22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat23 = fir.box_addr %repeat22 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %24 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %25 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %26 = fir.coordinate_of %5, %25 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %27 = fir.array_coor %26(%24) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %28 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %29 = fir.coordinate_of %5, %28 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %30 = fir.load %29 : !fir.ref>> + %31 = fir.box_addr %30 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} diff --git a/flang/test/Integration/unroll_and_jam.f90 b/flang/test/Integration/unroll_and_jam.f90 new file mode 100644 index 0000000000000..771b7fb411855 --- /dev/null +++ b/flang/test/Integration/unroll_and_jam.f90 @@ -0,0 +1,48 @@ +! RUN: %flang_fc1 -emit-llvm -o - %s | FileCheck %s + +! CHECK-LABEL: unroll_and_jam_dir +subroutine unroll_and_jam_dir + integer :: a(10) + !dir$ unroll_and_jam 4 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir + +! CHECK-LABEL: unroll_and_jam_dir_0 +subroutine unroll_and_jam_dir_0 + integer :: a(10) + !dir$ unroll_and_jam 0 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_0 + +! CHECK-LABEL: unroll_and_jam_dir_1 +subroutine unroll_and_jam_dir_1 + integer :: a(10) + !dir$ unroll_and_jam 1 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_1 + +! CHECK-LABEL: unroll_and_jam_dir_no_factor +subroutine unroll_and_jam_dir_no_factor + integer :: a(10) + !dir$ unroll_and_jam + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_NO_FACTOR:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_no_factor + +! CHECK: ![[ANNOTATION]] = distinct !{![[ANNOTATION]], ![[UNROLL_AND_JAM:.*]], ![[UNROLL_AND_JAM_COUNT:.*]]} +! CHECK: ![[UNROLL_AND_JAM]] = !{!"llvm.loop.unroll_and_jam.enable"} +! CHECK: ![[UNROLL_AND_JAM_COUNT]] = !{!"llvm.loop.unroll_and_jam.count", i32 4} +! CHECK: ![[ANNOTATION_DISABLE]] = distinct !{![[ANNOTATION_DISABLE]], ![[UNROLL_AND_JAM2:.*]]} +! CHECK: ![[UNROLL_AND_JAM2]] = !{!"llvm.loop.unroll_and_jam.disable"} +! 
CHECK: ![[ANNOTATION_NO_FACTOR]] = distinct !{![[ANNOTATION_NO_FACTOR]], ![[UNROLL_AND_JAM]]} diff --git a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 index 7a7d28db8d6f5..db50c9ac8ee9d 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 @@ -1,10 +1,10 @@ ! This test checks lowering of OpenMP declare reduction Directive. -// RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s +! RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s subroutine declare_red() integer :: my_var - // CHECK: not yet implemented: OpenMPDeclareReductionConstruct + !CHECK: not yet implemented: OpenMPDeclareReductionConstruct !$omp declare reduction (my_red : integer : omp_out = omp_in) initializer (omp_priv = 0) my_var = 0 end subroutine declare_red diff --git a/flang/test/Lower/OpenMP/atomic-update.f90 b/flang/test/Lower/OpenMP/atomic-update.f90 index 16dae9d5f301c..7d04745015faa 100644 --- a/flang/test/Lower/OpenMP/atomic-update.f90 +++ b/flang/test/Lower/OpenMP/atomic-update.f90 @@ -185,4 +185,19 @@ program OmpAtomicUpdate !$omp atomic update w = max(w,x,y,z) +!CHECK: %[[IMP_DO:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr { +!CHECK: ^bb0(%{{.*}}: index): +! [...] +!CHECK: %[[ADD_I1:.*]] = arith.addi {{.*}} : i32 +!CHECK: hlfir.yield_element %[[ADD_I1]] : i32 +!CHECK: } +! [...] +!CHECK: %[[SUM:.*]] = hlfir.sum %[[IMP_DO]] +!CHECK: omp.atomic.update %[[VAL_X_DECLARE]]#1 : !fir.ref { +!CHECK: ^bb0(%[[ARG0:.*]]: i32): +!CHECK: %[[ADD_I2:.*]] = arith.addi %[[ARG0]], %[[SUM]] : i32 +!CHECK: omp.yield(%[[ADD_I2]] : i32) +!CHECK: } + !$omp atomic update + x = x + sum([ (y+2, y=1, z) ]) end program OmpAtomicUpdate diff --git a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 index 7d02987c5eade..6201459bc42ca 100644 --- a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 @@ -15,8 +15,6 @@ !CHECK: %{{.*}} = fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[TP_VAL]]) fastmath : (!fir.ref, i32) -> i1 !CHECK: omp.terminator -!CHECK: fir.global internal @_QFsubEa : i32 - subroutine sub() integer, save:: a !$omp threadprivate(a) @@ -25,3 +23,55 @@ subroutine sub() !$omp end parallel end subroutine +!CHECK-LABEL: func.func @_QPsub_02() +subroutine sub_02() + integer, save :: a + !$omp threadprivate(a) + !CHECK: %[[ADDR_02:.*]] = fir.address_of(@_QFsub_02Ea) : !fir.ref + !CHECK: %[[DECL_02:.*]]:2 = hlfir.declare %[[ADDR_02]] {{{.*}} uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[TP_02:.*]] = omp.threadprivate %[[DECL_02]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_02:.*]]:2 = hlfir.declare %[[TP_02]] {{{.*}} uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + call sub_03 + !CHECK: fir.call @_QFsub_02Psub_03() fastmath : () -> () + !CHECK: return + +contains + + !CHECK-LABEL: func.func private @_QFsub_02Psub_03() + subroutine sub_03() + !CHECK: %[[ADDR_03:.*]] = fir.address_of(@_QFsub_02Ea) : !fir.ref + !CHECK: %[[DECL_03:.*]]:2 = hlfir.declare %[[ADDR_03]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[TP_03:.*]] = omp.threadprivate %[[DECL_03]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_03:.*]]:2 = hlfir.declare %[[TP_03]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !$omp parallel default(private) + !CHECK: omp.parallel + !CHECK: 
%[[TP_04:.*]] = omp.threadprivate %[[DECL_03]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_04:.*]]:2 = hlfir.declare %[[TP_04]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + print *, a + !CHECK: omp.terminator + !$omp end parallel + end subroutine +end subroutine + +module mod_01 + integer, save :: a + !CHECK: fir.global @_QMmod_01Ea : i32 + !$omp threadprivate(a) +end module + +!CHECK-LABEL: func.func @_QPsub_05() +subroutine sub_05() + use mod_01, only: a + !$omp parallel default(private) + !CHECK: omp.parallel { + !CHECK: %[[TP_05:.*]] = omp.threadprivate %{{.*}} : !fir.ref -> !fir.ref + !CHECK: %{{.*}} = hlfir.declare %[[TP_05]] {uniq_name = "_QMmod_01Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + print *, a + !CHECK: omp.terminator + !$omp end parallel +end subroutine + + +!CHECK: fir.global internal @_QFsubEa : i32 + +!CHECK: fir.global internal @_QFsub_02Ea : i32 diff --git a/flang/test/Lower/unroll_and_jam.f90 b/flang/test/Lower/unroll_and_jam.f90 new file mode 100644 index 0000000000000..afc5a7b6b271e --- /dev/null +++ b/flang/test/Lower/unroll_and_jam.f90 @@ -0,0 +1,34 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s + +! CHECK: #loop_unroll_and_jam = #llvm.loop_unroll_and_jam +! CHECK: #loop_unroll_and_jam1 = #llvm.loop_unroll_and_jam +! CHECK: #loop_annotation = #llvm.loop_annotation +! CHECK: #loop_annotation1 = #llvm.loop_annotation + +! CHECK-LABEL: unroll_and_jam_dir +subroutine unroll_and_jam_dir + integer :: a(10) + !dir$ unroll_and_jam + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation} + do i=1,10 + a(i)=i + end do + + !dir$ unroll_and_jam 2 + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation1} + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir + + +! CHECK-LABEL: intermediate_directive +subroutine intermediate_directive + integer :: a(10) + !dir$ unroll_and_jam + !dir$ unknown + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation} + do i=1,10 + a(i)=i + end do +end subroutine intermediate_directive diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 new file mode 100644 index 0000000000000..a2a3ef9f630ab --- /dev/null +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 @@ -0,0 +1,21 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s | FileCheck --ignore-case %s +! 
RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s +!CHECK-LABEL: program main +program main + integer :: my_var + !CHECK: !$OMP DECLARE REDUCTION (my_add_red:INTEGER: omp_out=omp_out+omp_in + !CHECK-NEXT: ) INITIALIZER(OMP_PRIV = 0_4) + + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + my_var = 0 + !$omp parallel reduction (my_add_red : my_var) num_threads(4) + my_var = omp_get_thread_num() + 1 + !$omp end parallel + print *, "sum of thread numbers is ", my_var +end program main + +!PARSE-TREE: OpenMPDeclareReductionConstruct +!PARSE-TREE: OmpReductionIdentifier -> ProcedureDesignator -> Name = 'my_add_red' +!PARSE-TREE: DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec +!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out=omp_out+omp_in' +!PARSE-TREE: OmpReductionInitializerClause -> Expr = '0_4' diff --git a/flang/test/Parser/compiler-directives.f90 b/flang/test/Parser/compiler-directives.f90 index f372a9f533a35..d1e386a01dd4d 100644 --- a/flang/test/Parser/compiler-directives.f90 +++ b/flang/test/Parser/compiler-directives.f90 @@ -46,3 +46,14 @@ subroutine unroll do i=1,10 enddo end subroutine + +subroutine unroll_and_jam + !dir$ unroll_and_jam + ! CHECK: !DIR$ UNROLL_AND_JAM + do i=1,10 + enddo + !dir$ unroll_and_jam 2 + ! CHECK: !DIR$ UNROLL_AND_JAM 2 + do i=1,10 + enddo +end subroutine diff --git a/flang/test/Semantics/OpenMP/declarative-directive01.f90 b/flang/test/Semantics/OpenMP/declarative-directive01.f90 index 17dc50b70e542..e8bf605565fad 100644 --- a/flang/test/Semantics/OpenMP/declarative-directive01.f90 +++ b/flang/test/Semantics/OpenMP/declarative-directive01.f90 @@ -2,9 +2,6 @@ ! Check OpenMP declarative directives -!TODO: all internal errors -! enable declare-reduction example after name resolution - ! 2.4 requires subroutine requires_1(a) @@ -88,15 +85,14 @@ end module m2 ! 2.16 declare-reduction -! subroutine declare_red_1() -! use omp_lib -! integer :: my_var -! !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) -! my_var = 0 -! !$omp parallel reduction (my_add_red : my_var) num_threads(4) -! my_var = omp_get_thread_num() + 1 -! !$omp end parallel -! print *, "sum of thread numbers is ", my_var -! end subroutine declare_red_1 +subroutine declare_red_1() + integer :: my_var + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + my_var = 0 + !$omp parallel reduction (my_add_red : my_var) num_threads(4) + my_var = 1 + !$omp end parallel + print *, "sum of thread numbers is ", my_var +end subroutine declare_red_1 end diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 new file mode 100644 index 0000000000000..8fee79dfc0b7b --- /dev/null +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -0,0 +1,11 @@ +! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s + +program main +!CHECK-LABEL: MainProgram scope: main + + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + +!CHECK: my_add_red: Misc ConstructName + +end program main + diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 1c1f9c9bfb0c6..f17f6287be313 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -44,7 +44,7 @@ this shouldn't be necessary. 
$> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc - $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc + $> clang hip.hip --offload-arch=gfx942 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc This will automatically link in the needed function definitions if they were required by the user's application. Normally using the ``-fgpu-rdc`` option diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index 212e3c6a9446c..c6fd33a55532c 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -47,6 +47,11 @@ #define __NOEXCEPT throw() #endif +// This macro serves as a generic cast implementation for use in both C and C++, +// similar to `__BIONIC_CAST` in Android. +#undef __LLVM_LIBC_CAST +#define __LLVM_LIBC_CAST(cast, type, value) (cast<type>(value)) + #else // not __cplusplus #undef __BEGIN_C_DECLS @@ -85,6 +90,9 @@ #undef _Returns_twice #define _Returns_twice __attribute__((returns_twice)) +#undef __LLVM_LIBC_CAST +#define __LLVM_LIBC_CAST(cast, type, value) ((type)(value)) + #endif // __cplusplus #endif // _LLVM_LIBC_COMMON_H diff --git a/libc/include/llvm-libc-macros/endian-macros.h b/libc/include/llvm-libc-macros/endian-macros.h index e1e105d50c1c6..52d95dc01cd83 100644 --- a/libc/include/llvm-libc-macros/endian-macros.h +++ b/libc/include/llvm-libc-macros/endian-macros.h @@ -20,27 +20,27 @@ #define htobe16(x) __builtin_bswap16((x)) #define htobe32(x) __builtin_bswap32((x)) #define htobe64(x) __builtin_bswap64((x)) -#define htole16(x) ((uint16_t)(x)) -#define htole32(x) ((uint32_t)(x)) -#define htole64(x) ((uint64_t)(x)) +#define htole16(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define htole32(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define htole64(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define be16toh(x) __builtin_bswap16((x)) #define be32toh(x) __builtin_bswap32((x)) #define be64toh(x) __builtin_bswap64((x)) -#define le16toh(x) ((uint16_t)(x)) -#define le32toh(x) ((uint32_t)(x)) -#define le64toh(x) ((uint64_t)(x)) +#define le16toh(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define le32toh(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define le64toh(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #else -#define htobe16(x) ((uint16_t)(x)) -#define htobe32(x) ((uint32_t)(x)) -#define htobe64(x) ((uint64_t)(x)) +#define htobe16(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define htobe32(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define htobe64(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define htole16(x) __builtin_bswap16((x)) #define htole32(x) __builtin_bswap32((x)) #define htole64(x) __builtin_bswap64((x)) -#define be16toh(x) ((uint16_t)(x)) -#define be32toh(x) ((uint32_t)(x)) -#define be64toh(x) ((uint64_t)(x)) +#define be16toh(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define be32toh(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define be64toh(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define le16toh(x) __builtin_bswap16((x)) #define le32toh(x) __builtin_bswap32((x)) #define le64toh(x) __builtin_bswap64((x)) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 5cefa8a264310..5a9a26c44f368 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -20,16 +20,12 @@ include( GNUInstallDirs ) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS amdgcn-amdhsa/lib/SOURCES; amdgcn/lib/SOURCES; -
amdgcn-mesa3d/lib/SOURCES; amdgpu/lib/SOURCES; clspv/lib/SOURCES; - clspv64/lib/SOURCES; generic/lib/SOURCES; - ptx/lib/SOURCES; ptx-nvidiacl/lib/SOURCES; r600/lib/SOURCES; spirv/lib/SOURCES; - spirv64/lib/SOURCES; # CLC internal libraries clc/lib/generic/SOURCES; ) @@ -211,7 +207,7 @@ set( cayman_aliases aruba ) set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11 gfx602 gfx705 gfx805 - gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942 + gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx942 gfx1010 gfx1011 gfx1012 gfx1013 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 @@ -280,11 +276,6 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( opencl_dirs ) - if ( NOT ${ARCH} STREQUAL spirv AND NOT ${ARCH} STREQUAL spirv64 AND - NOT ${ARCH} STREQUAL clspv AND NOT ${ARCH} STREQUAL clspv64) - LIST( APPEND opencl_dirs generic ) - endif() - if( ${ARCH} STREQUAL r600 OR ${ARCH} STREQUAL amdgcn ) list( APPEND opencl_dirs amdgpu ) endif() @@ -302,8 +293,25 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( DARCH ${ARCH} ) endif() + # Append a variety of target- and triple-based directories to search, + # increasing in specificity. + list( APPEND opencl_dirs ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} ) + + # The 'generic' directory contains all of the generic implementations of the + # builtins. It is included first so it has the lowest search priority, + # allowing targets to override builtins based on file names found later in + # the list of search directories. + # CLC builds all builtins for all targets, so unconditionally prepend the + # 'generic' directory. + set( clc_dirs generic ${opencl_dirs} ) + # Some OpenCL targets don't build all builtins, in which case they don't want + # the 'generic' directory. Otherwise, prepend the 'generic' directory. + if ( NOT ARCH STREQUAL spirv AND NOT ARCH STREQUAL spirv64 AND + NOT ARCH STREQUAL clspv AND NOT ARCH STREQUAL clspv64) + list( PREPEND opencl_dirs generic ) + endif() + set( clc_lib_files ) - set( clc_dirs ${dirs} generic ) if( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) set( clc_gen_files clc-clspv-convert.cl ) @@ -315,7 +323,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) clc_lib_files CLC_INTERNAL LIB_ROOT_DIR clc - DIRS ${clc_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${clc_dirs} ) set( opencl_lib_files ) @@ -334,7 +342,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) libclc_configure_lib_source( opencl_lib_files - DIRS ${opencl_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${opencl_dirs} ) foreach( d ${${t}_devices} ) diff --git a/libclc/clc/lib/generic/math/clc_nextafter.cl b/libclc/clc/lib/generic/math/clc_nextafter.cl index 58125485bf684..f77f3647d5985 100644 --- a/libclc/clc/lib/generic/math/clc_nextafter.cl +++ b/libclc/clc/lib/generic/math/clc_nextafter.cl @@ -1,5 +1,6 @@ #include #include +#include #include // This file provides OpenCL C implementations of __clc_nextafter for @@ -12,21 +13,30 @@ FLOAT_TYPE y) { \ const UINT_TYPE sign_bit = (UINT_TYPE)1 \ << (sizeof(INT_TYPE_SCALAR) * 8 - 1); \ - const UINT_TYPE sign_bit_mask = sign_bit - (UINT_TYPE)1; \ - INT_TYPE ix = CLC_AS_TYPE(INT_TYPE)(x); \ - UINT_TYPE ax = CLC_AS_TYPE(UINT_TYPE)(ix) & sign_bit_mask; \ - INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(sign_bit) - ix; \ - mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? 
mx : ix; \ - INT_TYPE iy = CLC_AS_TYPE(INT_TYPE)(y); \ - UINT_TYPE ay = CLC_AS_TYPE(UINT_TYPE)(iy) & sign_bit_mask; \ - INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(sign_bit) - iy; \ - my = iy < (INT_TYPE)0 ? my : iy; \ + UINT_TYPE ix = CLC_AS_TYPE(UINT_TYPE)(x); \ + FLOAT_TYPE absx = __clc_fabs(x); \ + UINT_TYPE mxu = sign_bit - ix; \ + INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(mxu); \ + mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? mx \ + : CLC_AS_TYPE(INT_TYPE)(ix); \ + UINT_TYPE iy = CLC_AS_TYPE(UINT_TYPE)(y); \ + FLOAT_TYPE absy = __clc_fabs(y); \ + UINT_TYPE myu = sign_bit - iy; \ + INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(myu); \ + my = CLC_AS_TYPE(INT_TYPE)(iy) < (INT_TYPE)0 ? my \ + : CLC_AS_TYPE(INT_TYPE)(iy); \ INT_TYPE t = mx + (mx < my ? (INT_TYPE)1 : (INT_TYPE)-1); \ - INT_TYPE r = CLC_AS_TYPE(INT_TYPE)(sign_bit) - t; \ - r = t < (INT_TYPE)0 ? r : t; \ + UINT_TYPE r = sign_bit - CLC_AS_TYPE(UINT_TYPE)(t); \ + r = (t < (INT_TYPE)0 || (t == (INT_TYPE)0 && mx < my)) \ + ? r \ + : CLC_AS_TYPE(UINT_TYPE)(t); \ r = __clc_isnan(x) ? ix : r; \ - r = __clc_isnan(y) ? CLC_AS_TYPE(INT_TYPE)(iy) : r; \ - r = ((ax | ay) == (UINT_TYPE)0 || ix == iy) ? iy : r; \ + r = __clc_isnan(y) ? iy : r; \ + r = ((CLC_AS_TYPE(UINT_TYPE)(absx) | CLC_AS_TYPE(UINT_TYPE)(absy)) == \ + (UINT_TYPE)0 || \ + ix == iy) \ + ? iy \ + : r; \ return CLC_AS_TYPE(FLOAT_TYPE)(r); \ } diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 40e31e0ba4f45..911559ff4bfa9 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -402,7 +402,8 @@ endfunction(add_libclc_builtin_set) # directory. If not provided, is set to '.'. # * DIRS ... # List of directories under LIB_ROOT_DIR to walk over searching for SOURCES -# files +# files. Directories earlier in the list have lower priority than +# subsequent ones. function(libclc_configure_lib_source LIB_FILE_LIST) cmake_parse_arguments(ARG "CLC_INTERNAL" @@ -417,7 +418,7 @@ function(libclc_configure_lib_source LIB_FILE_LIST) # Enumerate SOURCES* files set( source_list ) - foreach( l ${ARG_DIRS} ) + foreach( l IN LISTS ARG_DIRS ) foreach( s "SOURCES" "SOURCES_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}" ) if( ARG_CLC_INTERNAL ) file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/lib/${l}/${s} file_loc ) @@ -425,10 +426,10 @@ function(libclc_configure_lib_source LIB_FILE_LIST) file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/${l}/lib/${s} file_loc ) endif() file( TO_CMAKE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${file_loc} loc ) - # Prepend the location to give higher priority to - # specialized implementation + # Prepend the location to give higher priority to the specialized + # implementation if( EXISTS ${loc} ) - set( source_list ${file_loc} ${source_list} ) + list( PREPEND source_list ${file_loc} ) endif() endforeach() endforeach() diff --git a/libclc/generic/lib/math/ep_log.cl b/libclc/generic/lib/math/ep_log.cl index 90c9fa426fec1..f0b5d3fdfbb1c 100644 --- a/libclc/generic/lib/math/ep_log.cl +++ b/libclc/generic/lib/math/ep_log.cl @@ -38,57 +38,57 @@ #define LF1 1.24999999978138668903e-02 #define LF2 2.23219810758559851206e-03 -_CLC_DEF void __clc_ep_log(double x, int *xexp, double *r1, double *r2) -{ - // Computes natural log(x). 
Algorithm based on: - // Ping-Tak Peter Tang - // "Table-driven implementation of the logarithm function in IEEE - // floating-point arithmetic" - // ACM Transactions on Mathematical Software (TOMS) - // Volume 16, Issue 4 (December 1990) - int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0; - - ulong ux = as_ulong(x); - ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); - int c = ux < IMPBIT_DP64; - ux = c ? uxs : ux; - int expadjust = c ? 60 : 0; - - // Store the exponent of x in xexp and put f into the range [0.5,1) - int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; - double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); - *xexp = near_one ? 0 : xexp1; - - double r = x - 1.0; - double u1 = MATH_DIVIDE(r, 2.0 + r); - double ru1 = -r * u1; - u1 = u1 + u1; - - int index = as_int2(ux).hi >> 13; - index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); - - double f1 = index * 0x1.0p-7; - double f2 = f - f1; - double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1)); - - double2 tv = USE_TABLE(ln_tbl, (index - 64)); - double z1 = tv.s0; - double q = tv.s1; - - z1 = near_one ? r : z1; - q = near_one ? 0.0 : q; - double u = near_one ? u1 : u2; - double v = u*u; - - double cc = near_one ? ru1 : u2; - - double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0); - double z22 = fma(v, fma(v, LF2, LF1), LF0); - double z2 = near_one ? z21 : z22; - z2 = fma(u*v, z2, cc) + q; - - *r1 = z1; - *r2 = z2; +_CLC_DEF void __clc_ep_log(double x, private int *xexp, private double *r1, + private double *r2) { + // Computes natural log(x). Algorithm based on: + // Ping-Tak Peter Tang + // "Table-driven implementation of the logarithm function in IEEE + // floating-point arithmetic" + // ACM Transactions on Mathematical Software (TOMS) + // Volume 16, Issue 4 (December 1990) + int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0; + + ulong ux = as_ulong(x); + ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); + int c = ux < IMPBIT_DP64; + ux = c ? uxs : ux; + int expadjust = c ? 60 : 0; + + // Store the exponent of x in xexp and put f into the range [0.5,1) + int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; + double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); + *xexp = near_one ? 0 : xexp1; + + double r = x - 1.0; + double u1 = MATH_DIVIDE(r, 2.0 + r); + double ru1 = -r * u1; + u1 = u1 + u1; + + int index = as_int2(ux).hi >> 13; + index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); + + double f1 = index * 0x1.0p-7; + double f2 = f - f1; + double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1)); + + double2 tv = USE_TABLE(ln_tbl, (index - 64)); + double z1 = tv.s0; + double q = tv.s1; + + z1 = near_one ? r : z1; + q = near_one ? 0.0 : q; + double u = near_one ? u1 : u2; + double v = u * u; + + double cc = near_one ? ru1 : u2; + + double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0); + double z22 = fma(v, fma(v, LF2, LF1), LF0); + double z2 = near_one ? 
z21 : z22; + z2 = fma(u * v, z2, cc) + q; + + *r1 = z1; + *r2 = z2; } #endif diff --git a/libclc/generic/lib/math/ep_log.h b/libclc/generic/lib/math/ep_log.h index 414e6231f7fd6..3176cfe5b42ce 100644 --- a/libclc/generic/lib/math/ep_log.h +++ b/libclc/generic/lib/math/ep_log.h @@ -26,6 +26,7 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DECL void __clc_ep_log(double x, int *xexp, double *r1, double *r2); +_CLC_DECL void __clc_ep_log(double x, private int *xexp, private double *r1, + private double *r2); #endif diff --git a/libclc/generic/lib/math/modf.inc b/libclc/generic/lib/math/modf.inc index 1ffc6d9e851bd..ff7ef30dd42f8 100644 --- a/libclc/generic/lib/math/modf.inc +++ b/libclc/generic/lib/math/modf.inc @@ -28,18 +28,20 @@ #define ZERO 0.0h #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, __CLC_GENTYPE *iptr) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, + private __CLC_GENTYPE *iptr) { *iptr = trunc(x); return copysign(isinf(x) ? ZERO : x - *iptr, x); } -#define MODF_DEF(addrspace) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \ - __CLC_GENTYPE private_iptr; \ - __CLC_GENTYPE ret = modf(x, &private_iptr); \ - *iptr = private_iptr; \ - return ret; \ -} +#define MODF_DEF(addrspace) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, \ + addrspace __CLC_GENTYPE *iptr) { \ + __CLC_GENTYPE private_iptr; \ + __CLC_GENTYPE ret = modf(x, &private_iptr); \ + *iptr = private_iptr; \ + return ret; \ + } MODF_DEF(local); MODF_DEF(global); diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl index 22f2bf61bf27d..441bad2be432f 100644 --- a/libclc/generic/lib/math/sincos_helpers.cl +++ b/libclc/generic/lib/math/sincos_helpers.cl @@ -119,8 +119,8 @@ _CLC_DEF float __clc_tanf_piby4(float x, int regn) { return regn & 1 ? 
tr : t; } -_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, - float bt) { +_CLC_DEF void __clc_fullMulS(private float *hi, private float *lo, float a, + float b, float bh, float bt) { if (HAVE_HW_FMA32()) { float ph = a * b; *hi = ph; @@ -136,7 +136,7 @@ _CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, } } -_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { +_CLC_DEF float __clc_removePi2S(private float *hi, private float *lo, float x) { // 72 bits of pi/2 const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f; const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f; @@ -174,7 +174,8 @@ _CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { return fnpi2; } -_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionSmallS(private float *r, private float *rr, + float x) { float fnpi2 = __clc_removePi2S(r, rr, x); return (int)fnpi2 & 0x3; } @@ -188,7 +189,8 @@ _CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { HI = __clc_mul_hi(A, B); \ HI += LO < C -_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionLargeS(private float *r, private float *rr, + float x) { int xe = (int)(as_uint(x) >> 23) - 127; uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); @@ -330,7 +332,7 @@ _CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { return ((i >> 1) + (i & 1)) & 0x3; } -_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionS(private float *r, private float *rr, float x) { if (x < 0x1.0p+23f) return __clc_argReductionSmallS(r, rr, x); else @@ -342,8 +344,9 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { #pragma OPENCL EXTENSION cl_khr_fp64 : enable // Reduction for medium sized arguments -_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, - int *regn) { +_CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r, + private double *rr, + private int *regn) { // How many pi/2 is x a multiple of? const double two_by_pi = 0x1.45f306dc9c883p-1; double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5)); @@ -387,8 +390,9 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, // Return value "regn" tells how many lots of pi/2 were subtracted // from x to put it in the range [-pi/4,pi/4], mod 4. 
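These argument-reduction helpers (which here only gain explicit private address-space qualifiers; the math is unchanged) all feed one reconstruction step: once x is reduced to r in [-pi/4, pi/4] plus a quadrant count regn, both sin and cos follow from a single kernel pair. A hedged C++ illustration of that quadrant dispatch, using libm as a stand-in for the polynomial kernels:

#include <cmath>

// Sketch: rebuild sin(x) from the reduced argument r and quadrant count regn,
// where x ~= regn * (pi/2) + r and regn is taken mod 4.
double sin_from_reduction(double r, int regn) {
  double s = std::sin(r); // stands in for the [-pi/4, pi/4] sine kernel
  double c = std::cos(r); // stands in for the cosine kernel
  switch (regn & 3) {
  case 0:  return s;  // sin(r)
  case 1:  return c;  // sin(r + pi/2)  =  cos(r)
  case 2:  return -s; // sin(r + pi)    = -sin(r)
  default: return -c; // sin(r + 3pi/2) = -cos(r)
  }
}

cos(x) is the same table shifted by one quadrant, which is why the helpers only need to hand back r (plus its tail rr) and regn.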
-_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, - int *regn) { +_CLC_DEF void __clc_remainder_piby2_large(double x, private double *r, + private double *rr, + private int *regn) { long ux = as_long(x); int e = (int)(ux >> 52) - 1023; diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h index 6dbca73aa2a2e..c7981e5278f2a 100644 --- a/libclc/generic/lib/math/sincos_helpers.h +++ b/libclc/generic/lib/math/sincos_helpers.h @@ -26,16 +26,18 @@ _CLC_DECL float __clc_sinf_piby4(float x, float y); _CLC_DECL float __clc_cosf_piby4(float x, float y); _CLC_DECL float __clc_tanf_piby4(float x, int y); -_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x); +_CLC_DECL int __clc_argReductionS(private float *r, private float *rr, float x); #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, - int *regn); -_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, - int *regn); +_CLC_DECL void __clc_remainder_piby2_medium(double x, private double *r, + private double *rr, + private int *regn); +_CLC_DECL void __clc_remainder_piby2_large(double x, private double *r, + private double *rr, + private int *regn); _CLC_DECL double2 __clc_sincos_piby4(double x, double xx); #endif diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index ccaa784ccb088..dcf9838edd74b 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -174,7 +174,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_atomic_flag_test`` ``201907L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_atomic_float`` *unimplemented* + ``__cpp_lib_atomic_float`` ``201711L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_atomic_lock_free_type_aliases`` ``201907L`` ---------------------------------------------------------- ----------------- diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 2439360797023..88a0666611a9a 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -43,8 +43,8 @@ Implemented Papers Improvements and New Features ----------------------------- -- The ``std::ranges::{copy, copy_n, copy_backward}`` algorithms have been optimized for ``std::vector::iterator``\s, - resulting in a performance improvement of up to 2000x. +- The ``std::ranges::{copy, copy_n, copy_backward, move, move_backward}`` algorithms have been optimized for + ``std::vector::iterator``, resulting in a performance improvement of up to 2000x. - Updated formatting library to Unicode 16.0.0. diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index 360b5520260ce..1c060c32b6f57 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -2,7 +2,7 @@ "`P0463R1 `__","Endian just Endian","2017-07 (Toronto)","|Complete|","7","" "`P0674R1 `__","Extending make_shared to Support Arrays","2017-07 (Toronto)","|Complete|","15","" "","","","","","" -"`P0020R6 `__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18","" +"`P0020R6 `__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18","The feature-test macro was not set until LLVM 20." 
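Since libc++ now advertises __cpp_lib_atomic_float (the feature-test table entry above, together with the <version> change later in this patch), user code can portably select the C++20 floating-point fetch_add instead of a hand-rolled CAS loop. A small usage sketch; accumulate is an illustrative name, not a library facility:

#include <atomic>
#include <version>

#if defined(__cpp_lib_atomic_float) && __cpp_lib_atomic_float >= 201711L
// P0020R6: fetch_add/fetch_sub are provided directly for floating-point atomics.
double accumulate(std::atomic<double>& total, double sample) {
  return total.fetch_add(sample, std::memory_order_relaxed);
}
#else
// Portable fallback: a compare-exchange loop works on any implementation.
double accumulate(std::atomic<double>& total, double sample) {
  double old = total.load(std::memory_order_relaxed);
  while (!total.compare_exchange_weak(old, old + sample, std::memory_order_relaxed)) {
  }
  return old;
}
#endif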
"`P0053R7 `__","C++ Synchronized Buffered Ostream","2017-11 (Albuquerque)","|Complete|","18","" "`P0202R3 `__","Add constexpr modifiers to functions in and Headers","2017-11 (Albuquerque)","|Complete|","12","" "`P0415R1 `__","Constexpr for ``std::complex``\ ","2017-11 (Albuquerque)","|Complete|","16","" diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h index 6f3b0eb5d2927..a3320e9f1985d 100644 --- a/libcxx/include/__algorithm/move.h +++ b/libcxx/include/__algorithm/move.h @@ -9,11 +9,13 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_H #define _LIBCPP___ALGORITHM_MOVE_H +#include <__algorithm/copy.h> #include <__algorithm/copy_move_common.h> #include <__algorithm/for_each_segment.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/common_type.h> @@ -98,6 +100,14 @@ struct __move_impl { } } + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { + return std::__copy(__first, __last, __result); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h index 24a8d9b24527a..14482fee18114 100644 --- a/libcxx/include/__algorithm/move_backward.h +++ b/libcxx/include/__algorithm/move_backward.h @@ -9,10 +9,12 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_BACKWARD_H #define _LIBCPP___ALGORITHM_MOVE_BACKWARD_H +#include <__algorithm/copy_backward.h> #include <__algorithm/copy_move_common.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/common_type.h> @@ -107,6 +109,14 @@ struct __move_backward_impl { } } + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { + return std::__copy_backward<_ClassicAlgPolicy>(__first, __last, __result); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference index aad470394732c..377f5fed12266 100644 --- a/libcxx/include/__bit_reference +++ b/libcxx/include/__bit_reference @@ -210,22 +210,6 @@ private: __mask_(__m) {} }; -// move - -template -inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> -move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return std::copy(__first, __last, __result); -} - -// move_backward - -template -inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> move_backward( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return std::copy_backward(__first, __last, __result); -} - // swap_ranges template diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h index cff99376ee24b..8d0f8f63f5213 100644 --- a/libcxx/include/__configuration/platform.h +++ b/libcxx/include/__configuration/platform.h @@ -32,12 +32,14 @@ // Need to detect which libc we're using if we're on Linux. #if defined(__linux__) || defined(__AMDGPU__) || defined(__NVPTX__) -# include -# if defined(__GLIBC_PREREQ) -# define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) -# else -# define _LIBCPP_GLIBC_PREREQ(a, b) 0 -# endif // defined(__GLIBC_PREREQ) +# if __has_include() +# include +# if defined(__GLIBC_PREREQ) +# define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) +# else +# define _LIBCPP_GLIBC_PREREQ(a, b) 0 +# endif // defined(__GLIBC_PREREQ) +# endif #endif #ifndef __BYTE_ORDER__ diff --git a/libcxx/include/__variant/monostate.h b/libcxx/include/__variant/monostate.h index c5d2dacaf4205..b29bbdf5cdbe4 100644 --- a/libcxx/include/__variant/monostate.h +++ b/libcxx/include/__variant/monostate.h @@ -49,10 +49,12 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>=(monostate, monostate) noe template <> struct _LIBCPP_TEMPLATE_VIS hash { - using argument_type = monostate; - using result_type = size_t; +# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS) + using argument_type _LIBCPP_DEPRECATED_IN_CXX17 = monostate; + using result_type _LIBCPP_DEPRECATED_IN_CXX17 = size_t; +# endif - inline _LIBCPP_HIDE_FROM_ABI result_type operator()(const argument_type&) const _NOEXCEPT { + inline _LIBCPP_HIDE_FROM_ABI size_t operator()(const monostate&) const noexcept { return 66740831; // return a fundamentally attractive random value. 
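The __bit_iterator overloads added to the move and move_backward implementations above (and the matching free functions deleted from __bit_reference) can forward to the copy machinery because bool elements carry no move-only state: for vector<bool>, moving bits is exactly copying them, one word at a time. User code picks this up transparently; a minimal sketch:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<bool> src(1000);
  for (std::size_t i = 0; i < src.size(); i += 2)
    src[i] = true;
  std::vector<bool> dst(src.size());
  // Dispatches to the word-wise bit copy rather than a bit-by-bit loop.
  std::move(src.begin(), src.end(), dst.begin());
  assert(dst[0] == true && dst[1] == false);
  return 0;
}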
} }; diff --git a/libcxx/include/string b/libcxx/include/string index 396e73522d3e7..3f43e8fd8d586 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -691,50 +691,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD // basic_string -template -basic_string<_CharT, _Traits, _Allocator> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -_LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const _CharT* __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(_CharT __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const _CharT* __y); - template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, _CharT __y); - -# if _LIBCPP_STD_VER >= 26 - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - type_identity_t> __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t> __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs); - -# endif - -extern template _LIBCPP_EXPORTED_FROM_ABI string operator+ - , allocator >(char const*, string const&); +__concatenate_strings(const _Allocator& __alloc, + __type_identity_t > __str1, + __type_identity_t > __str2); template struct __string_is_trivial_iterator : public false_type {}; @@ -2425,15 +2386,8 @@ private: std::__throw_out_of_range("basic_string"); } - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const value_type*, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(value_type, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const value_type*); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, value_type); -# if _LIBCPP_STD_VER >= 26 - friend constexpr basic_string operator+ <>(const basic_string&, type_identity_t<__self_view>); - friend constexpr basic_string operator+ <>(type_identity_t<__self_view>, const basic_string&); -# endif + friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string + __concatenate_strings<>(const _Allocator&, __type_identity_t<__self_view>, __type_identity_t<__self_view>); template friend inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool @@ -3815,83 +3769,73 @@ operator>=(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const 
basic_string<_CharT, _Traits, _Allocator>& __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs) { +__concatenate_strings(const _Allocator& __alloc, + __type_identity_t > __str1, + __type_identity_t > __str2) { using _String = basic_string<_CharT, _Traits, _Allocator>; - auto __lhs_sz = __lhs.size(); - auto __rhs_sz = __rhs.size(); _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); + __str1.size() + __str2.size(), + _String::__alloc_traits::select_on_container_copy_construction(__alloc)); auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); + _Traits::copy(__ptr, __str1.data(), __str1.size()); + _Traits::copy(__ptr + __str1.size(), __str2.data(), __str2.size()); + _Traits::assign(__ptr[__str1.size() + __str2.size()], _CharT()); return __r; } +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); +} + template _LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - auto __lhs_sz = _Traits::length(__lhs); - auto __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs, __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>(__rhs.get_allocator(), __lhs, __rhs); } +extern template _LIBCPP_EXPORTED_FROM_ABI string operator+ + , allocator >(char const*, string const&); + template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(_CharT __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __rhs_sz + 1, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::assign(__ptr, 1, __lhs); - _Traits::copy(__ptr + 1, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + 1 + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>( + __rhs.get_allocator(), basic_string_view<_CharT, _Traits>(&__lhs, 1), __rhs); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = _Traits::length(__rhs); - _String 
__r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs, __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, _CharT __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + 1, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::assign(__ptr + __lhs_sz, 1, __rhs); - _Traits::assign(__ptr + 1 + __lhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>( + __lhs.get_allocator(), __lhs, basic_string_view<_CharT, _Traits>(&__rhs, 1)); +} +# if _LIBCPP_STD_VER >= 26 + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + type_identity_t> __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); +} + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(type_identity_t> __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__rhs.get_allocator(), __lhs, __rhs); } +# endif // _LIBCPP_STD_VER >= 26 + # ifndef _LIBCPP_CXX03_LANG template @@ -3942,54 +3886,18 @@ operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, _CharT __rhs) { # if _LIBCPP_STD_VER >= 26 -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - type_identity_t> __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; -} - template _LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t> __rhs) { - __lhs.append(__rhs); - return std::move(__lhs); -} - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = 
std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::move(__lhs.append(__rhs)); } template _LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> operator+(type_identity_t> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs) { - __rhs.insert(0, __lhs); - return std::move(__rhs); + return std::move(__rhs.insert(0, __lhs)); } # endif // _LIBCPP_STD_VER >= 26 diff --git a/libcxx/include/variant b/libcxx/include/variant index 3786d9524020b..9998d4a457715 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -1585,10 +1585,12 @@ swap(variant<_Types...>& __lhs, template struct _LIBCPP_TEMPLATE_VIS hash< __enable_hash_helper, remove_const_t<_Types>...>> { - using argument_type = variant<_Types...>; - using result_type = size_t; +# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS) + using argument_type _LIBCPP_DEPRECATED_IN_CXX17 = variant<_Types...>; + using result_type _LIBCPP_DEPRECATED_IN_CXX17 = size_t; +# endif - _LIBCPP_HIDE_FROM_ABI result_type operator()(const argument_type& __v) const { + _LIBCPP_HIDE_FROM_ABI size_t operator()(const variant<_Types...>& __v) const { using __variant_detail::__visitation::__variant; size_t __res = __v.valueless_by_exception() diff --git a/libcxx/include/version b/libcxx/include/version index c5966b90c061d..63ead9fd5d29d 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -378,7 +378,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_array_constexpr 201811L # define __cpp_lib_assume_aligned 201811L # define __cpp_lib_atomic_flag_test 201907L -// # define __cpp_lib_atomic_float 201711L +# define __cpp_lib_atomic_float 201711L # define __cpp_lib_atomic_lock_free_type_aliases 201907L # define __cpp_lib_atomic_ref 201806L // # define __cpp_lib_atomic_shared_ptr 201711L diff --git a/libcxx/test/benchmarks/algorithms/move.bench.cpp b/libcxx/test/benchmarks/algorithms/move.bench.cpp new file mode 100644 index 0000000000000..73f36f0c129de --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/move.bench.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
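The hash<monostate> and hash<variant> changes above follow the same recipe: the argument_type/result_type member typedefs were deprecated in C++17 and removed in C++20, so they are now emitted only for older standards or when the _LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS escape hatch is defined. A sketch of the user-facing effect:

#include <cstddef>
#include <functional>
#include <variant>

// Post-C++20 style: name the types directly instead of reaching through
// std::hash<T>::argument_type / std::hash<T>::result_type.
std::size_t hash_mono(const std::monostate& m) {
  return std::hash<std::monostate>{}(m);
}

// Pre-C++20 style; under a C++20 libc++ this only compiles when the
// removed-typedef escape hatch is defined:
//   using Arg = std::hash<std::monostate>::argument_type; // deprecated in C++17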
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include <algorithm> +#include <benchmark/benchmark.h> +#include <ranges> +#include <vector> + +template <bool aligned> +void bm_ranges_move_vb(benchmark::State& state) { + auto n = state.range(); + std::vector<bool> v1(n, true); + std::vector<bool> v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector<bool>* in = &v1; + std::vector<bool>* out = &v2; + for (auto _ : state) { + if constexpr (aligned) { + benchmark::DoNotOptimize(std::ranges::move(*in, std::ranges::begin(*out))); + } else { + benchmark::DoNotOptimize( + std::ranges::move(std::views::counted(in->begin() + 4, n - 4), std::ranges::begin(*out))); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +template <bool aligned> +void bm_move_vb(benchmark::State& state) { + auto n = state.range(); + std::vector<bool> v1(n, true); + std::vector<bool> v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector<bool>* in = &v1; + std::vector<bool>* out = &v2; + for (auto _ : state) { + auto first1 = in->begin(); + auto last1 = in->end(); + auto first2 = out->begin(); + if constexpr (aligned) { + benchmark::DoNotOptimize(std::move(first1, last1, first2)); + } else { + benchmark::DoNotOptimize(std::move(first1 + 4, last1, first2)); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(bm_ranges_move_vb<true>) + ->Name("bm_ranges_move_vb_aligned") + ->Range(8, 1 << 16) + ->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_move_vb<false>)->Name("bm_ranges_move_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK(bm_move_vb<true>)->Name("bm_move_vb_aligned")->Range(8, 1 << 20); +BENCHMARK(bm_move_vb<false>)->Name("bm_move_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp b/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp new file mode 100644 index 0000000000000..23d7395198419 --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include <algorithm> +#include <benchmark/benchmark.h> +#include <ranges> +#include <vector> + +template <bool aligned> +void bm_ranges_move_backward_vb(benchmark::State& state) { + auto n = state.range(); + std::vector<bool> v1(n, true); + std::vector<bool> v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector<bool>* in = &v1; + std::vector<bool>* out = &v2; + for (auto _ : state) { + if constexpr (aligned) { + benchmark::DoNotOptimize(std::ranges::move_backward(*in, std::ranges::end(*out))); + } else { + benchmark::DoNotOptimize( + std::ranges::move_backward(std::views::counted(in->begin(), n - 4), std::ranges::end(*out))); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +template <bool aligned> +void bm_move_backward_vb(benchmark::State& state) { + auto n = state.range(); + std::vector<bool> v1(n, true); + std::vector<bool> v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector<bool>* in = &v1; + std::vector<bool>* out = &v2; + for (auto _ : state) { + auto first1 = in->begin(); + auto last1 = in->end(); + auto last2 = out->end(); + if constexpr (aligned) { + benchmark::DoNotOptimize(std::move_backward(first1, last1, last2)); + } else { + benchmark::DoNotOptimize(std::move_backward(first1, last1 - 4, last2)); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(bm_ranges_move_backward_vb<true>) + ->Name("bm_ranges_move_backward_vb_aligned") + ->Range(8, 1 << 16) + ->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_move_backward_vb<false>)->Name("bm_ranges_move_backward_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK(bm_move_backward_vb<true>)->Name("bm_move_backward_vb_aligned")->Range(8, 1 << 20); +BENCHMARK(bm_move_backward_vb<false>)->Name("bm_move_backward_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp index 1ca397c92a334..3d4ee23a5a7ff 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp @@ -18,6 +18,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -81,7 +82,7 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } TEST_CONSTEXPR_CXX20 bool test() { - types::for_each(types::cpp17_input_iterator_list(), TestInIters()); + types::for_each(types::cpp17_input_iterator_list(), TestInIters()); { // Make sure that padding bits aren't copied Derived src(1, 2, 3); @@ -91,7 +92,6 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::copy(a + 3, a + 10, a); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp index 445c7718e1111..8a528a96f5294 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp @@ -19,6 +19,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include
"user_defined_integral.h" class PaddedBase { @@ -36,21 +37,29 @@ class Derived : public PaddedBase { std::int8_t c_; }; -template -TEST_CONSTEXPR_CXX20 void test_copy_backward() { - { - const unsigned N = 1000; - int ia[N] = {}; - for (unsigned i = 0; i < N; ++i) - ia[i] = i; - int ib[N] = {0}; - - OutIter r = std::copy_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); - assert(base(r) == ib); - for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); +struct TestIterators { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each(types::bidirectional_iterator_list(), TestImpl()); } -} + + template + struct TestImpl { + template + TEST_CONSTEXPR_CXX20 void operator()() { + const unsigned N = 1000; + int ia[N] = {}; + for (unsigned i = 0; i < N; ++i) + ia[i] = i; + int ib[N] = {0}; + + OutIter r = std::copy_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); + assert(base(r) == ib); + for (unsigned i = 0; i < N; ++i) + assert(ia[i] == ib[i]); + } + }; +}; TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -70,31 +79,10 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } return true; -}; +} TEST_CONSTEXPR_CXX20 bool test() { - test_copy_backward, bidirectional_iterator >(); - test_copy_backward, random_access_iterator >(); - test_copy_backward, int*>(); - - test_copy_backward, bidirectional_iterator >(); - test_copy_backward, random_access_iterator >(); - test_copy_backward, int*>(); - - test_copy_backward >(); - test_copy_backward >(); - test_copy_backward(); - -#if TEST_STD_VER > 17 - test_copy_backward, bidirectional_iterator>(); - test_copy_backward, random_access_iterator>(); - test_copy_backward, int*>(); - - test_copy_backward, contiguous_iterator>(); - test_copy_backward, contiguous_iterator>(); - test_copy_backward, contiguous_iterator>(); - test_copy_backward>(); -#endif + types::for_each(types::bidirectional_iterator_list(), TestIterators()); { // Make sure that padding bits aren't copied Derived src(1, 2, 3); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp index 57214e65455b4..3bee77738e342 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp @@ -19,75 +19,48 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" -struct Pred -{ - TEST_CONSTEXPR_CXX14 bool operator()(int i) {return i % 3 == 0;} +struct Pred { + TEST_CONSTEXPR_CXX14 bool operator()(int i) { return i % 3 == 0; } }; -template -TEST_CONSTEXPR_CXX20 void -test_copy_if() -{ +template +struct TestOutIters { + template + TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::copy_if(InIter(ia), InIter(ia+N), OutIter(ib), Pred()); - assert(base(r) == ib+N/3+1); - for (unsigned i = 0; i < N/3+1; ++i) - assert(ib[i] % 3 == 0); -} - -TEST_CONSTEXPR_CXX20 bool -test() -{ - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, 
bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); + OutIter r = std::copy_if(InIter(ia), InIter(ia + N), OutIter(ib), Pred()); + assert(base(r) == ib + N / 3 + 1); + for (unsigned i = 0; i < N / 3 + 1; ++i) + assert(ib[i] % 3 == 0); + } +}; - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if(); +struct TestInIters { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each( + types::concatenate_t, types::type_list > >(), + TestOutIters()); + } +}; +TEST_CONSTEXPR_CXX20 bool test() { + types::for_each(types::cpp17_input_iterator_list(), TestInIters()); return true; } -int main(int, char**) -{ - test(); +int main(int, char**) { + test(); #if TEST_STD_VER > 17 - static_assert(test()); + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp index 889e71f4eceb9..2053134a01a2f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp @@ -18,6 +18,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include "user_defined_integral.h" typedef UserDefinedIntegral UDI; @@ -37,37 +38,31 @@ class Derived : public PaddedBase { std::int8_t c_; }; -template -TEST_CONSTEXPR_CXX20 void test_copy_n() { - { - const unsigned N = 1000; - int ia[N] = {}; - for (unsigned i = 0; i < N; ++i) - ia[i] = i; - int ib[N] = {0}; - - OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib)); - assert(base(r) == ib + N / 2); - for (unsigned i = 0; i < N / 2; ++i) - assert(ia[i] == ib[i]); +struct TestIterators { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each( + types::concatenate_t, types::type_list > >(), + TestImpl()); } - { // Make sure that padding bits aren't copied - Derived src(1, 2, 3); - Derived dst(4, 5, 6); - std::copy_n(static_cast(&src), 1, static_cast(&dst)); - assert(dst.a_ == 1); - assert(dst.b_ == 2); - assert(dst.c_ == 6); - } - - { // Make sure that overlapping ranges can be copied - int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - std::copy_n(a + 3, 7, a); - int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; - assert(std::equal(a, a + 10, expected)); - } -} + template + struct TestImpl { + template + TEST_CONSTEXPR_CXX20 void operator()() { + const unsigned N = 1000; + int ia[N] = {}; + for (unsigned i = 0; i < N; ++i) + ia[i] = i; + int ib[N] = {0}; + + OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib)); + assert(base(r) == ib + N / 2); + for (unsigned i = 0; i < N / 2; ++i) + assert(ia[i] == ib[i]); + } + }; +}; TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -90,40 +85,23 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } TEST_CONSTEXPR_CXX20 bool test() { - test_copy_n, cpp17_output_iterator >(); - test_copy_n, 
cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n(); + types::for_each(types::cpp17_input_iterator_list(), TestIterators()); + + { // Make sure that padding bits aren't copied + Derived src(1, 2, 3); + Derived dst(4, 5, 6); + std::copy_n(static_cast(&src), 1, static_cast(&dst)); + assert(dst.a_ == 1); + assert(dst.b_ == 2); + assert(dst.c_ == 6); + } + + { // Make sure that overlapping ranges can be copied + int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + std::copy_n(a + 3, 7, a); + int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; + assert(std::equal(a, a + 10, expected)); + } { // Test vector::iterator optimization assert(test_vector_bool(8)); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp index bee1ef9bcec33..6229aac733a9c 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(copy); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp index 128108ac13811..7208be75c70d0 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(copy_n); @@ -58,7 +59,7 @@ struct TestIteratorsInt { }; struct CopiedToTester { - bool copied_to = false; + bool copied_to = false; CopiedToTester() = default; CopiedToTester(const CopiedToTester&) {} CopiedToTester& operator=(const CopiedToTester&) { diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp index c7031f63a02f6..577328d663d9f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp @@ -40,7 +40,7 @@ static_assert(!HasCopyNIt, std::ranges::in_out_result>); -template +template constexpr void test_iterators() { { // simple test std::array in{1, 2, 3, 4}; @@ -61,26 +61,6 
@@ constexpr void test_iterators() { } } -template -constexpr void test_in_iterators() { - test_iterators, Out, sentinel_wrapper>>(); - test_iterators, Out>(); - test_iterators, Out>(); - test_iterators, Out>(); - test_iterators, Out>(); -} - -template -constexpr void test_proxy_in_iterators() { - test_iterators>, - Out, - sentinel_wrapper>>>(); - test_iterators>, Out>(); - test_iterators>, Out>(); - test_iterators>, Out>(); - test_iterators>, Out>(); -} - #if TEST_STD_VER >= 23 constexpr bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -104,17 +84,12 @@ constexpr bool test_vector_bool(std::size_t N) { #endif constexpr bool test() { - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); + types::for_each(types::cpp20_input_iterator_list{}, []() { + types::for_each(types::cpp20_input_iterator_list{}, []() { + test_iterators(); + test_iterators, ProxyIterator>(); + }); + }); { // check that every element is copied exactly once struct CopyOnce { diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp index 7656be73c14c6..0e532ae834e7f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp @@ -22,6 +22,7 @@ #include "sized_allocator.h" #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" template TEST_CONSTEXPR_CXX20 void diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp index 3b67101a8b29e..98c412fb6cdc0 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp @@ -14,108 +14,93 @@ // fill_n(Iter first, Size n, const T& value); #include +#include #include +#include #include #include "sized_allocator.h" #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include "user_defined_integral.h" -#if TEST_STD_VER > 17 -TEST_CONSTEXPR bool test_constexpr() { - const std::size_t N = 5; - int ib[] = {0, 0, 0, 0, 0, 0}; // one bigger than N - - auto it = std::fill_n(std::begin(ib), N, 5); - return it == (std::begin(ib) + N) && std::all_of(std::begin(ib), it, [](int a) { return a == 5; }) && - *it == 0 // don't overwrite the last value in the output array - ; -} -#endif - typedef UserDefinedIntegral UDI; -template -void test_char() { - char a[4] = {}; - Iter it = std::fill_n(Iter(a), UDI(4), char(1)); - assert(base(it) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); +template +TEST_CONSTEXPR_CXX20 void +test(Container in, size_t from, size_t n, typename Container::value_type value, Container expected) { + Iter it = std::fill_n(Iter(in.data() + from), UDI(n), value); + assert(base(it) == in.data() + from + n); + assert(in == expected); } -template -void test_int() { - int a[4] = {}; - Iter it = std::fill_n(Iter(a), UDI(4), 1); - assert(base(it) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); -} +template +struct Test { + template + TEST_CONSTEXPR_CXX20 void operator()() 
{ + { + std::array in = {1, 2, 3, 4}; + std::array expected = {5, 5, 5, 5}; + test(in, 0, 4, 5, expected); + } + { + std::array in = {1, 2, 3, 4}; + std::array expected = {1, 5, 5, 4}; + test(in, 1, 2, 5, expected); + } + } +}; -void test_int_array() { - int a[4] = {}; - assert(std::fill_n(a, UDI(4), static_cast(1)) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); +TEST_CONSTEXPR_CXX20 void test_int_array() { + { + int a[4] = {}; + assert(std::fill_n(a, UDI(4), static_cast(1)) == a + 4); + assert(a[0] == 1 && a[1] == 1 && a[2] == 1 && a[3] == 1); + } +#if TEST_STD_VER >= 11 + { + const std::size_t N = 5; + int ib[] = {0, 0, 0, 0, 0, 0}; // one bigger than N + + auto it = std::fill_n(std::begin(ib), N, 5); + assert(it == (std::begin(ib) + N) && std::all_of(std::begin(ib), it, [](int a) { return a == 5; }) && + *it == 0 // don't overwrite the last value in the output array + ); + } +#endif } struct source { - source() : i(0) {} - - operator int() const { return i++; } - mutable int i; + TEST_CONSTEXPR source() = default; + TEST_CONSTEXPR_CXX20 operator int() const { return 1; } }; -void test_int_array_struct_source() { +TEST_CONSTEXPR_CXX20 void test_int_array_struct_source() { int a[4] = {}; assert(std::fill_n(a, UDI(4), source()) == a + 4); - assert(a[0] == 0); + assert(a[0] == 1); assert(a[1] == 1); - assert(a[2] == 2); - assert(a[3] == 3); -} - -struct test1 { - test1() : c(0) {} - test1(char xc) : c(xc + 1) {} - char c; -}; - -void test_struct_array() { - test1 test1a[4] = {}; - assert(std::fill_n(test1a, UDI(4), static_cast(10)) == test1a + 4); - assert(test1a[0].c == 11); - assert(test1a[1].c == 11); - assert(test1a[2].c == 11); - assert(test1a[3].c == 11); + assert(a[2] == 1); + assert(a[3] == 1); } class A { char a_; public: - A() {} - explicit A(char a) : a_(a) {} - operator unsigned char() const { return 'b'; } + TEST_CONSTEXPR A() : a_('a') {}; + TEST_CONSTEXPR explicit A(char a) : a_(a) {} + TEST_CONSTEXPR operator unsigned char() const { return 'b'; } - friend bool operator==(const A& x, const A& y) { return x.a_ == y.a_; } + TEST_CONSTEXPR friend bool operator==(const A& x, const A& y) { return x.a_ == y.a_; } }; -void test5() { - A a[3]; - assert(std::fill_n(&a[0], UDI(3), A('a')) == a + 3); - assert(a[0] == A('a')); - assert(a[1] == A('a')); - assert(a[2] == A('a')); -} +struct B { + TEST_CONSTEXPR B() : c(0) {} + TEST_CONSTEXPR B(char xc) : c(xc + 1) {} + char c; +}; struct Storage { union { @@ -124,11 +109,6 @@ struct Storage { }; }; -void test6() { - Storage foo[5]; - std::fill_n(&foo[0], UDI(5), Storage()); -} - // Make sure std::fill_n behaves properly with std::vector iterators with custom size types. // See https://github.com/llvm/llvm-project/pull/122410. 
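A pattern recurs throughout these test rewrites: hand-enumerated (InIter, OutIter) instantiations are replaced by types::for_each over an iterator type list with a nested functor, so every combination is exercised without being spelled out. A self-contained sketch of the mechanism follows; type_list and for_each here are hypothetical minimal stand-ins for the helpers in the test suite's type_algorithms.h:

template <class... Ts>
struct type_list {};

// Invoke f.operator()<T>() for every T in the list.
template <class F, class... Ts>
constexpr void for_each(type_list<Ts...>, F f) {
  (f.template operator()<Ts>(), ...);
}

struct TestInIters {
  template <class InIter>
  struct TestOutIters {
    template <class OutIter>
    constexpr void operator()() const {
      // Run the algorithm under test with this (InIter, OutIter) pair.
    }
  };
  template <class InIter>
  constexpr void operator()() const {
    for_each(type_list<int*>{}, TestOutIters<InIter>{});
  }
};

constexpr bool test() {
  for_each(type_list<const int*, int*>{}, TestInIters{});
  return true;
}
static_assert(test(), "every iterator combination is instantiated");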
TEST_CONSTEXPR_CXX20 void test_bititer_with_custom_sized_types() { @@ -162,30 +142,44 @@ TEST_CONSTEXPR_CXX20 void test_bititer_with_custom_sized_types() { } } -int main(int, char**) { - test_char >(); - test_char >(); - test_char >(); - test_char >(); - test_char(); - - test_int >(); - test_int >(); - test_int >(); - test_int >(); - test_int(); +TEST_CONSTEXPR_CXX20 void test_struct_array() { + { + A a[3]; + assert(std::fill_n(&a[0], UDI(3), A('a')) == a + 3); + assert(a[0] == A('a')); + assert(a[1] == A('a')); + assert(a[2] == A('a')); + } + { + B b[4] = {}; + assert(std::fill_n(b, UDI(4), static_cast(10)) == b + 4); + assert(b[0].c == 11); + assert(b[1].c == 11); + assert(b[2].c == 11); + assert(b[3].c == 11); + } + { + Storage foo[5]; + std::fill_n(&foo[0], UDI(5), Storage()); + } +} + +TEST_CONSTEXPR_CXX20 bool test() { + types::for_each(types::forward_iterator_list(), Test()); + types::for_each(types::forward_iterator_list(), Test()); test_int_array(); - test_int_array_struct_source(); test_struct_array(); - - test5(); - test6(); - + test_int_array_struct_source(); test_bititer_with_custom_sized_types(); -#if TEST_STD_VER > 17 - static_assert(test_constexpr()); + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 20 + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp index 556326fb0894c..e456fa8986aad 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(fill); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp index 4abbd6f7a17c3..51232dfef1606 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(fill_n); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp index b1ad6873bc5e5..e28484ee4984b 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp @@ -20,10 +20,12 @@ #include #include #include +#include #include "MoveOnly.h" #include "test_iterators.h" #include "test_macros.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -45,15 +47,15 @@ struct Test { template TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::move(InIter(ia), InIter(ia+N), OutIter(ib)); - assert(base(r) == ib+N); + OutIter r = std::move(InIter(ia), InIter(ia + N), OutIter(ib)); + assert(base(r) == ib + N); for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); + assert(ia[i] == ib[i]); } }; @@ -73,13 +75,13 @@ struct Test1 { const unsigned N = 
100; std::unique_ptr ia[N]; for (unsigned i = 0; i < N; ++i) - ia[i].reset(new int(i)); + ia[i].reset(new int(i)); std::unique_ptr ib[N]; - OutIter r = std::move(InIter(ia), InIter(ia+N), OutIter(ib)); - assert(base(r) == ib+N); + OutIter r = std::move(InIter(ia), InIter(ia + N), OutIter(ib)); + assert(base(r) == ib + N); for (unsigned i = 0; i < N; ++i) - assert(*ib[i] == static_cast(i)); + assert(*ib[i] == static_cast(i)); } }; @@ -92,11 +94,32 @@ struct Test1OutIters { } }; +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move with aligned bytes + std::vector in(v); + std::vector out(N); + std::move(in.begin(), in.end(), out.begin()); + assert(out == v); + } + { // Test move with unaligned bytes + std::vector in(v); + std::vector out(N); + std::move(in.begin() + 4, in.end(), out.begin()); + for (std::size_t i = 0; i < N - 4; ++i) + assert(v[i + 4] == out[i]); + } + + return true; +} + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::cpp17_input_iterator_list(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) types::for_each(types::cpp17_input_iterator_list*>(), Test1OutIters()); - { // Make sure that padding bits aren't copied Derived src(1, 2, 3); Derived dst(4, 5, 6); @@ -105,20 +128,17 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::move(a + 3, a + 10, a); int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; assert(std::equal(a, a + 10, expected)); } - - // Make sure that the algorithm works with move-only types - { + { // Make sure that the algorithm works with move-only types // When non-trivial { MoveOnly from[3] = {1, 2, 3}; - MoveOnly to[3] = {}; + MoveOnly to[3] = {}; std::move(std::begin(from), std::end(from), std::begin(to)); assert(to[0] == MoveOnly(1)); assert(to[1] == MoveOnly(2)); @@ -127,7 +147,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When trivial { TrivialMoveOnly from[3] = {1, 2, 3}; - TrivialMoveOnly to[3] = {}; + TrivialMoveOnly to[3] = {}; std::move(std::begin(from), std::end(from), std::begin(to)); assert(to[0] == TrivialMoveOnly(1)); assert(to[1] == TrivialMoveOnly(2)); @@ -135,6 +155,16 @@ TEST_CONSTEXPR_CXX20 bool test() { } } + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + return true; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp index 61dea47b51071..d8b7e68b155d6 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp @@ -19,10 +19,12 @@ #include #include #include +#include #include "MoveOnly.h" #include "test_iterators.h" #include "test_macros.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -44,24 +46,22 @@ struct Test { template TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::move_backward(InIter(ia), InIter(ia+N), OutIter(ib+N)); + OutIter r = 
std::move_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); + assert(ia[i] == ib[i]); } }; struct TestOutIters { template <class InIter> TEST_CONSTEXPR_CXX20 void operator()() { - types::for_each( - types::concatenate_t<types::bidirectional_iterator_list<int*>, types::type_list<cpp17_output_iterator<int*> > >(), - Test<InIter>()); + types::for_each(types::concatenate_t<types::bidirectional_iterator_list<int*>, types::type_list<cpp17_output_iterator<int*> > >(), Test<InIter>()); } }; @@ -72,29 +72,50 @@ struct Test1 { template <class OutIter> TEST_CONSTEXPR_CXX23 void operator()() { const unsigned N = 100; std::unique_ptr<int> ia[N]; for (unsigned i = 0; i < N; ++i) - ia[i].reset(new int(i)); + ia[i].reset(new int(i)); std::unique_ptr<int> ib[N]; - OutIter r = std::move_backward(InIter(ia), InIter(ia+N), OutIter(ib+N)); + OutIter r = std::move_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) - assert(*ib[i] == static_cast<int>(i)); + assert(*ib[i] == static_cast<int>(i)); } }; struct Test1OutIters { template <class InIter> TEST_CONSTEXPR_CXX23 void operator()() { - types::for_each(types::concatenate_t<types::bidirectional_iterator_list<std::unique_ptr<int>*>, types::type_list<cpp17_output_iterator<std::unique_ptr<int>*> > >(), - Test1<InIter>()); + types::for_each( + types::concatenate_t<types::bidirectional_iterator_list<std::unique_ptr<int>*>, types::type_list<cpp17_output_iterator<std::unique_ptr<int>*> > >(), Test1<InIter>()); } }; +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + std::vector<bool> v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move_backward with aligned bytes + std::vector<bool> in(v); + std::vector<bool> out(N); + std::move_backward(in.begin(), in.end(), out.end()); + assert(out == v); + } + { // Test move_backward with unaligned bytes + std::vector<bool> in(v); + std::vector<bool> out(N); + std::move_backward(in.begin(), in.end() - 4, out.end()); + for (std::size_t i = 0; i < N - 4; ++i) + assert(out[i + 4] == v[i]); + } + + return true; +} + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::bidirectional_iterator_list<int*>(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) types::for_each(types::bidirectional_iterator_list<std::unique_ptr<int>*>(), Test1OutIters()); - { // Make sure that padding bits aren't copied Derived src(1, 2, 3); Derived dst(4, 5, 6); @@ -104,20 +125,17 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::move_backward(a, a + 7, a + 10); int expected[] = {1, 2, 3, 1, 2, 3, 4, 5, 6, 7}; assert(std::equal(a, a + 10, expected)); } - - // Make sure that the algorithm works with move-only types - { + { // Make sure that the algorithm works with move-only types // When non-trivial { MoveOnly from[3] = {1, 2, 3}; - MoveOnly to[3] = {}; + MoveOnly to[3] = {}; std::move_backward(std::begin(from), std::end(from), std::end(to)); assert(to[0] == MoveOnly(1)); assert(to[1] == MoveOnly(2)); @@ -126,7 +144,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When trivial { TrivialMoveOnly from[3] = {1, 2, 3}; - TrivialMoveOnly to[3] = {}; + TrivialMoveOnly to[3] = {}; std::move_backward(std::begin(from), std::end(from), std::end(to)); assert(to[0] == TrivialMoveOnly(1)); assert(to[1] == TrivialMoveOnly(2)); @@ -134,11 +152,20 @@ TEST_CONSTEXPR_CXX20 bool test() { } } + { // Test vector<bool>::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + return true; } -int main(int, char**) -{ +int main(int, char**) { test(); #if TEST_STD_VER >= 20 static_assert(test()); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp index
e4cc5649ce5d8..a82a068caf031 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(move); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp index a0d1473360a14..1a89408865892 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp @@ -31,6 +31,7 @@ #include "almost_satisfies_types.h" #include "MoveOnly.h" #include "test_iterators.h" +#include "test_macros.h" template <class In, class Out = In, class Sent = sentinel_wrapper<In>> concept HasMoveIt = requires(In in, Sent sent, Out out) { std::ranges::move(in, sent, out); }; @@ -65,7 +66,7 @@ constexpr void test(std::array<int, N> in) { { std::array<int, N> out; std::same_as<std::ranges::in_out_result<In, Out>> decltype(auto) ret = - std::ranges::move(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data())); + std::ranges::move(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data() + out.size()); @@ -73,8 +74,7 @@ constexpr void test(std::array<int, N> in) { { std::array<int, N> out; auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size()))); - std::same_as<std::ranges::in_out_result<In, Out>> decltype(auto) ret = - std::ranges::move(range, Out(out.data())); + std::same_as<std::ranges::in_out_result<In, Out>> decltype(auto) ret = std::ranges::move(range, Out(out.data())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data() + out.size()); @@ -84,16 +84,16 @@ template <class InContainer, class OutContainer, class In, class Out, class Sent = In> constexpr void test_containers() { { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); std::same_as<std::ranges::in_out_result<In, Out>> auto ret = - std::ranges::move(In(in.begin()), Sent(In(in.end())), Out(out.begin())); + std::ranges::move(In(in.begin()), Sent(In(in.end())), Out(out.begin())); assert(std::ranges::equal(in, out)); assert(base(ret.in) == in.end()); assert(base(ret.out) == out.end()); } { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); auto range = std::ranges::subrange(In(in.begin()), Sent(In(in.end()))); std::same_as<std::ranges::in_out_result<In, Out>> auto ret = std::ranges::move(range, Out(out.begin())); @@ -165,22 +165,52 @@ constexpr void test_proxy_in_iterators() { } struct IteratorWithMoveIter { - using value_type = int; - using difference_type = int; + using value_type = int; + using difference_type = int; explicit IteratorWithMoveIter() = default; int* ptr; constexpr IteratorWithMoveIter(int* ptr_) : ptr(ptr_) {} constexpr int& operator*() const; // iterator with iter_move should not be dereferenced - constexpr IteratorWithMoveIter& operator++() { ++ptr; return *this; } - constexpr IteratorWithMoveIter operator++(int) { auto ret = *this; ++*this; return ret; } + constexpr IteratorWithMoveIter& operator++() { + ++ptr; + return *this; + } + constexpr IteratorWithMoveIter operator++(int) { + auto ret = *this; + ++*this; + return ret; + } friend constexpr int iter_move(const IteratorWithMoveIter&) { return 42; } constexpr bool operator==(const IteratorWithMoveIter& other) const = default; }; +#if TEST_STD_VER >= 23 +constexpr bool test_vector_bool(std::size_t N) { + std::vector<bool> v(N, false); + for
(std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move with aligned bytes + std::vector<bool> in{v}; + std::vector<bool> out(N); + std::ranges::move(in, out.begin()); + assert(out == v); + } + { // Test move with unaligned bytes + std::vector<bool> in{v}; + std::vector<bool> out(N); + std::ranges::move(std::views::counted(in.begin() + 4, N - 4), out.begin()); + assert(std::ranges::equal(v | std::views::drop(4), out | std::views::take(N - 4))); + } + + return true; +} +#endif + // cpp17_intput_iterator has a defaulted template argument template <class It> using Cpp17InIter = cpp17_input_iterator<It>; @@ -267,13 +297,13 @@ constexpr bool test() { { // check that ranges::dangling is returned std::array<int, 4> out; std::same_as<std::ranges::in_out_result<std::ranges::dangling, int*>> decltype(auto) ret = - std::ranges::move(std::array {1, 2, 3, 4}, out.data()); + std::ranges::move(std::array{1, 2, 3, 4}, out.data()); assert(ret.out == out.data() + 4); assert((out == std::array{1, 2, 3, 4})); } { // check that an iterator is returned with a borrowing range - std::array<int, 4> in {1, 2, 3, 4}; + std::array<int, 4> in{1, 2, 3, 4}; std::array<int, 4> out; std::same_as<std::ranges::in_out_result<std::array<int, 4>::iterator, int*>> decltype(auto) ret = std::ranges::move(std::views::all(in), out.data()); @@ -284,8 +314,8 @@ constexpr bool test() { { // check that every element is moved exactly once struct MoveOnce { - bool moved = false; - constexpr MoveOnce() = default; + bool moved = false; + constexpr MoveOnce() = default; constexpr MoveOnce(const MoveOnce& other) = delete; constexpr MoveOnce& operator=(MoveOnce&& other) { assert(!other.moved); @@ -294,16 +324,16 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.moved; })); } { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move(in, out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); @@ -314,8 +344,8 @@ constexpr bool test() { { // check that the range is moved forwards struct OnlyForwardsMovable { OnlyForwardsMovable* next = nullptr; - bool canMove = false; - OnlyForwardsMovable() = default; + bool canMove = false; + OnlyForwardsMovable() = default; constexpr OnlyForwardsMovable& operator=(OnlyForwardsMovable&&) { assert(canMove); if (next != nullptr) @@ -324,12 +354,12 @@ constexpr bool test() { } }; { - std::array<OnlyForwardsMovable, 3> in {}; - std::array<OnlyForwardsMovable, 3> out {}; - out[0].next = &out[1]; - out[1].next = &out[2]; + std::array<OnlyForwardsMovable, 3> in{}; + std::array<OnlyForwardsMovable, 3> out{}; + out[0].next = &out[1]; + out[1].next = &out[2]; out[0].canMove = true; - auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); + auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(out[0].canMove); @@ -337,12 +367,12 @@ constexpr bool test() { assert(out[2].canMove); } { - std::array<OnlyForwardsMovable, 3> in {}; - std::array<OnlyForwardsMovable, 3> out {}; - out[0].next = &out[1]; - out[1].next = &out[2]; + std::array<OnlyForwardsMovable, 3> in{}; + std::array<OnlyForwardsMovable, 3> out{}; + out[0].next = &out[1]; + out[1].next = &out[2]; out[0].canMove = true; - auto ret = std::ranges::move(in, out.begin()); + auto ret = std::ranges::move(in, out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(out[0].canMove); @@ -358,19 +388,31 @@ constexpr bool test() { auto ret = std::ranges::move(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4), b.data()); assert(ret.in == a + 4); assert(ret.out == b.data() + 4); - assert((b
== std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } { int a[] = {1, 2, 3, 4}; std::array<int, 4> b; auto range = std::ranges::subrange(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4)); - auto ret = std::ranges::move(range, b.data()); + auto ret = std::ranges::move(range, b.data()); assert(ret.in == a + 4); assert(ret.out == b.data() + 4); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } } +#if TEST_STD_VER >= 23 + { // Test vector<bool>::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } +#endif + return true; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp index 47cf178636ad1..923b4c790dd1d 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp @@ -31,6 +31,7 @@ #include "almost_satisfies_types.h" #include "MoveOnly.h" #include "test_iterators.h" +#include "test_macros.h" template <class In, class Out = In, class Sent = sentinel_wrapper<In>> concept HasMoveBackwardIt = requires(In in, Sent sent, Out out) { std::ranges::move_backward(in, sent, out); }; @@ -65,7 +66,7 @@ constexpr void test(std::array<int, N> in) { { std::array<int, N> out; std::same_as<std::ranges::in_out_result<In, Out>> decltype(auto) ret = - std::ranges::move_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); + std::ranges::move_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data()); @@ -92,16 +93,16 @@ constexpr void test_iterators() { template <class InContainer, class OutContainer, class In, class Out, class Sent = In> constexpr void test_containers() { { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); std::same_as<std::ranges::in_out_result<In, Out>> auto ret = - std::ranges::move_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); + std::ranges::move_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); assert(std::ranges::equal(in, out)); assert(base(ret.in) == in.end()); assert(base(ret.out) == out.begin()); } { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); auto range = std::ranges::subrange(In(in.begin()), Sent(In(in.end()))); std::same_as<std::ranges::in_out_result<In, Out>> auto ret = std::ranges::move_backward(range, Out(out.end())); @@ -159,25 +160,62 @@ constexpr void test_proxy_in_iterators() { } struct IteratorWithMoveIter { - using value_type = int; - using difference_type = int; + using value_type = int; + using difference_type = int; explicit IteratorWithMoveIter() = default; int* ptr; constexpr IteratorWithMoveIter(int* ptr_) : ptr(ptr_) {} constexpr int& operator*() const; // iterator with iter_move should not be dereferenced - constexpr IteratorWithMoveIter& operator++() { ++ptr; return *this; } - constexpr IteratorWithMoveIter operator++(int) { auto ret = *this; ++*this; return ret; } + constexpr IteratorWithMoveIter& operator++() { + ++ptr; + return *this; + } + constexpr IteratorWithMoveIter operator++(int) { + auto ret = *this; + ++*this; + return ret; + } - constexpr
IteratorWithMoveIter& operator--() { + --ptr; + return *this; + } + constexpr IteratorWithMoveIter operator--(int) { + auto ret = *this; + --*this; + return ret; + } friend constexpr int iter_move(const IteratorWithMoveIter&) { return 42; } constexpr bool operator==(const IteratorWithMoveIter& other) const = default; }; +#if TEST_STD_VER >= 23 +constexpr bool test_vector_bool(std::size_t N) { + std::vector<bool> v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move_backward with aligned bytes + std::vector<bool> in{v}; + std::vector<bool> out(N); + std::ranges::move_backward(in, out.end()); + assert(out == v); + } + { // Test move_backward with unaligned bytes + std::vector<bool> in{v}; + std::vector<bool> out(N); + std::ranges::move_backward(std::views::counted(in.begin(), N - 4), out.end()); + assert(std::ranges::equal(v | std::views::take(N - 4), out | std::views::drop(4))); + } + + return true; +} +#endif + constexpr bool test() { test_in_iterators(); test_in_iterators(); @@ -243,7 +281,8 @@ constexpr bool test() { MoveOnly b[3]; ProxyRange proxyA{a}; ProxyRange proxyB{b}; - std::ranges::move_backward(std::begin(proxyA), std::end(proxyA), std::ranges::next(proxyB.begin(), std::end(proxyB))); + std::ranges::move_backward( + std::begin(proxyA), std::end(proxyA), std::ranges::next(proxyB.begin(), std::end(proxyB))); assert(b[0].get() == 1); assert(b[1].get() == 2); assert(b[2].get() == 3); @@ -253,13 +292,13 @@ constexpr bool test() { { // check that ranges::dangling is returned std::array<int, 4> out; std::same_as<std::ranges::in_out_result<std::ranges::dangling, int*>> auto ret = - std::ranges::move_backward(std::array {1, 2, 3, 4}, out.data() + out.size()); + std::ranges::move_backward(std::array{1, 2, 3, 4}, out.data() + out.size()); assert(ret.out == out.data()); assert((out == std::array{1, 2, 3, 4})); } { // check that an iterator is returned with a borrowing range - std::array<int, 4> in {1, 2, 3, 4}; + std::array<int, 4> in{1, 2, 3, 4}; std::array<int, 4> out; std::same_as<std::ranges::in_out_result<std::array<int, 4>::iterator, int*>> auto ret = std::ranges::move_backward(std::views::all(in), out.data() + out.size()); @@ -270,8 +309,8 @@ constexpr bool test() { { // check that every element is moved exactly once struct MoveOnce { - bool moved = false; - constexpr MoveOnce() = default; + bool moved = false; + constexpr MoveOnce() = default; constexpr MoveOnce(const MoveOnce& other) = delete; constexpr MoveOnce& operator=(const MoveOnce& other) { assert(!other.moved); @@ -280,16 +319,16 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.moved; })); } { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move_backward(in, out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); @@ -300,8 +339,8 @@ constexpr bool test() { { // check that the range is moved backwards struct OnlyBackwardsMovable { OnlyBackwardsMovable* next = nullptr; - bool canMove = false; - OnlyBackwardsMovable() = default; + bool canMove = false; + OnlyBackwardsMovable() = default; constexpr OnlyBackwardsMovable& operator=(const OnlyBackwardsMovable&) { assert(canMove); if (next != nullptr) @@ -310,12 +349,12 @@ constexpr bool test() { } }; { - std::array<OnlyBackwardsMovable, 3> in {}; - std::array<OnlyBackwardsMovable, 3> out {}; - out[1].next = &out[0]; - out[2].next = &out[1]; + std::array<OnlyBackwardsMovable, 3> in{}; + std::array<OnlyBackwardsMovable, 3> out{}; + out[1].next = &out[0]; + out[2].next =
&out[1]; out[2].canMove = true; - auto ret = std::ranges::move_backward(in, out.end()); + auto ret = std::ranges::move_backward(in, out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(out[0].canMove); @@ -323,12 +362,12 @@ constexpr bool test() { assert(out[2].canMove); } { - std::array<OnlyBackwardsMovable, 3> in {}; - std::array<OnlyBackwardsMovable, 3> out {}; - out[1].next = &out[0]; - out[2].next = &out[1]; + std::array<OnlyBackwardsMovable, 3> in{}; + std::array<OnlyBackwardsMovable, 3> out{}; + out[1].next = &out[0]; + out[2].next = &out[1]; out[2].canMove = true; - auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); + auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(out[0].canMove); @@ -344,19 +383,31 @@ constexpr bool test() { auto ret = std::ranges::move_backward(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4), b.data() + b.size()); assert(ret.in == a + 4); assert(ret.out == b.data()); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } { int a[] = {1, 2, 3, 4}; std::array<int, 4> b; auto range = std::ranges::subrange(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4)); - auto ret = std::ranges::move_backward(range, b.data() + b.size()); + auto ret = std::ranges::move_backward(range, b.data() + b.size()); assert(ret.in == a + 4); assert(ret.out == b.data()); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } } +#if TEST_STD_VER >= 23 + { // Test vector<bool>::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } +#endif + return true; } diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp index 9ed18fbfe19ac..5a21e6320bffe 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp @@ -169,17 +169,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++20" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++20" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!"
-# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++20" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++20" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -262,17 +256,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++23" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++23" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++23" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -355,17 +343,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++26" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++26" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++26" # endif # ifndef __cpp_lib_atomic_is_always_lock_free diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 137d6cb428930..1e4465d515e6b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -3282,17 +3282,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++20" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++20" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++20" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++20" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -4707,17 +4701,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++23" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++23" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++23" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -6369,17 +6357,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++26" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++26" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++26" # endif # ifndef __cpp_lib_atomic_is_always_lock_free diff --git a/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp b/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp new file mode 100644 index 0000000000000..9b22cbda9f345 --- /dev/null +++ b/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++17 + +#include <variant> + +#include "test_macros.h" + +using A1 [[maybe_unused]] = std::hash<std::variant<int>>::argument_type; +using R1 [[maybe_unused]] = std::hash<std::variant<int>>::result_type; +#if TEST_STD_VER >= 20 +// expected-error@-3 {{no type named 'argument_type' in 'std::hash<std::variant<int>>'}} +// expected-error@-3 {{no type named 'result_type' in 'std::hash<std::variant<int>>'}} +#else +// expected-warning@-6 {{'argument_type' is deprecated}} +// expected-warning@-6 {{'result_type' is deprecated}} +#endif + +using A2 [[maybe_unused]] = std::hash<std::monostate>::argument_type; +using R2 [[maybe_unused]] = std::hash<std::monostate>::result_type; +#if TEST_STD_VER >= 20 +// expected-error@-3 {{no type named 'argument_type' in 'std::hash<std::monostate>'}} +// expected-error@-3 {{no type named 'result_type' in 'std::hash<std::monostate>'}} +#else +// expected-warning@-6 {{'argument_type' is deprecated}} +// expected-warning@-6 {{'result_type' is deprecated}} +#endif diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 25168b9087754..8bf7633e985d5 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -169,7 +169,6 @@ def add_version_header(tc): "name": "__cpp_lib_atomic_float", "values": {"c++20": 201711}, "headers": ["atomic"], - "unimplemented": True, }, { "name": "__cpp_lib_atomic_is_always_lock_free", diff --git a/lld/test/ELF/aarch64-feature-gcs.s b/lld/test/ELF/aarch64-feature-gcs.s index 7a08673dbb7e6..b53a653dddaee 100644 --- a/lld/test/ELF/aarch64-feature-gcs.s +++ b/lld/test/ELF/aarch64-feature-gcs.s @@ -36,15 +36,15 @@ ## gcs-report should report any input files that don't have the gcs property.
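A note on the __cpp_lib_atomic_float hunks above: now that the macro is required to be defined in C++20 and later (and the generator no longer marks it "unimplemented"), C++20 floating-point atomics are usable with libc++. A minimal usage sketch, assuming only a C++20 toolchain; the names here are illustrative and not part of the patch:

#include <atomic>
#include <cassert>

int main() {
#if defined(__cpp_lib_atomic_float) && __cpp_lib_atomic_float >= 201711L
  std::atomic<float> acc{0.0f}; // C++20 adds fetch_add/fetch_sub for atomic<float>
  acc.fetch_add(1.5f);
  acc += 2.5f;                  // compound assignment is provided as well
  assert(acc.load() == 4.0f);   // 1.5f + 2.5f is exactly representable
#endif
  return 0;
}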
-# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | count 0 -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | count 0 -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning -z gcs=always 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning -z gcs=never 2>&1 | count 0 # REPORT-WARN: warning: func2.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property # REPORT-ERROR: error: func3.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property diff --git a/lld/test/ELF/aarch64-relocs.s b/lld/test/ELF/aarch64-relocs.s index 198674c085b54..39cfcdd38661d 100644 --- a/lld/test/ELF/aarch64-relocs.s +++ b/lld/test/ELF/aarch64-relocs.s @@ -25,12 +25,10 @@ mystr: .asciz "blah" .size mystr, 4 -# PAGE(S + A) - PAGE(P) = PAGE(210136) - PAGE(0x210132) = 0 -# # CHECK: Disassembly of section .R_AARCH64_ADR_PREL_PG_HI21: # CHECK-EMPTY: # CHECK-NEXT: <.R_AARCH64_ADR_PREL_PG_HI21>: -# CHECK-NEXT: 210132: 90000001 adrp x1, 0x210000 +# CHECK-NEXT: adrp x1, 0x210000 .section .R_AARCH64_ADD_ABS_LO12_NC,"ax",@progbits add x0, x0, :lo12:.L.str @@ -64,39 +62,16 @@ foo: nop sub: nop - -# CHECK: Disassembly of section .SUB: -# CHECK-EMPTY: -# CHECK-NEXT: <.SUB>: -# CHECK-NEXT: 21014c: d503201f nop -# CHECK: : -# CHECK-NEXT: 210150: d503201f nop - .section .R_AARCH64_CALL26,"ax",@progbits call26: bl sub + b sub -# S = 0x21014c, A = 0x4, P = 0x210154 -# R = S + A - P = -0x4 = 0xfffffffc -# (R & 0x0ffffffc) >> 2 = 0x03ffffff -# 0x94000000 | 0x03ffffff = 0x97ffffff # CHECK: Disassembly of section .R_AARCH64_CALL26: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: 210154: 97ffffff bl 
0x210150 - -.section .R_AARCH64_JUMP26,"ax",@progbits -jump26: - b sub - -# S = 0x21014c, A = 0x4, P = 0x210158 -# R = S + A - P = -0x8 = 0xfffffff8 -# (R & 0x0ffffffc) >> 2 = 0x03fffffe -# 0x14000000 | 0x03fffffe = 0x17fffffe -# CHECK: Disassembly of section .R_AARCH64_JUMP26: -# CHECK-EMPTY: -# CHECK-NEXT: : -# CHECK-NEXT: 210158: 17fffffe b 0x210150 +# CHECK-NEXT: bl {{.*}} +# CHECK-NEXT: b {{.*}} .section .R_AARCH64_LDST32_ABS_LO12_NC,"ax",@progbits ldst32: @@ -179,14 +154,14 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_UABS: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: f280018c movk x12, #12 -# CHECK-NEXT: f280018c movk x12, #12 -# CHECK-NEXT: f2a001ad movk x13, #13, lsl #16 -# CHECK-NEXT: f2a001ad movk x13, #13, lsl #16 -# CHECK-NEXT: f2c001ce movk x14, #14, lsl #32 -# CHECK-NEXT: f2c001ce movk x14, #14, lsl #32 -# CHECK-NEXT: d2e001ef mov x15, #4222124650659840 -# CHECK-NEXT: f2e001f0 movk x16, #15, lsl #48 +# CHECK-NEXT: movk x12, #12 +# CHECK-NEXT: movk x12, #12 +# CHECK-NEXT: movk x13, #13, lsl #16 +# CHECK-NEXT: movk x13, #13, lsl #16 +# CHECK-NEXT: movk x14, #14, lsl #32 +# CHECK-NEXT: movk x14, #14, lsl #32 +# CHECK-NEXT: mov x15, #4222124650659840 +# CHECK-NEXT: movk x16, #15, lsl #48 .section .R_AARCH64_MOVW_SABS,"ax",@progbits movz x1, #:abs_g0_s:zero+1 @@ -199,15 +174,15 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_SABS: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: d2800021 mov x1, #1 -# CHECK-NEXT: 92800001 mov x1, #-1 -# CHECK-NEXT: d2a00042 mov x2, #131072 +# CHECK-NEXT: mov x1, #1 +# CHECK-NEXT: mov x1, #-1 +# CHECK-NEXT: mov x2, #131072 ## -65537 = 0xfffffffffffeffff -# CHECK-NEXT: 92a00022 mov x2, #-65537 +# CHECK-NEXT: mov x2, #-65537 ## 12884901888 = 0x300000000 -# CHECK-NEXT: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #12884901888 ## -8589934593 = #0xfffffffdffffffff -# CHECK-NEXT: 92c00043 mov x3, #-8589934593 +# CHECK-NEXT: mov x3, #-8589934593 .section .R_AARCH64_MOVW_PREL,"ax",@progbits movz x1, #:prel_g0:.+1 @@ -231,24 +206,24 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_PREL: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: 2101bc: d2800021 mov x1, #1 -# CHECK-NEXT: 2101c0: 92800001 mov x1, #-1 -# CHECK-NEXT: 2101c4: f2800021 movk x1, #1 -# CHECK-NEXT: 2101c8: f29fffe1 movk x1, #65535 -# CHECK-NEXT: 2101cc: d2a00042 mov x2, #131072 +# CHECK-NEXT: mov x1, #1 +# CHECK-NEXT: mov x1, #-1 +# CHECK-NEXT: movk x1, #1 +# CHECK-NEXT: movk x1, #65535 +# CHECK-NEXT: mov x2, #131072 ## -65537 = 0xfffffffffffeffff -# CHECK-NEXT: 2101d0: 92a00022 mov x2, #-65537 -# CHECK-NEXT: 2101d4: f2a00042 movk x2, #2, lsl #16 -# CHECK-NEXT: 2101d8: f2bfffc2 movk x2, #65534, lsl #16 +# CHECK-NEXT: mov x2, #-65537 +# CHECK-NEXT: movk x2, #2, lsl #16 +# CHECK-NEXT: movk x2, #65534, lsl #16 ## 12884901888 = 0x300000000 -# CHECK-NEXT: 2101dc: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #12884901888 ## -8589934593 = #0xfffffffdffffffff -# CHECK-NEXT: 2101e0: 92c00043 mov x3, #-8589934593 -# CHECK-NEXT: 2101e4: f2c00063 movk x3, #3, lsl #32 -# CHECK-NEXT: 2101e8: f2dfffa3 movk x3, #65533, lsl #32 -# CHECK-NEXT: 2101ec: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #-8589934593 +# CHECK-NEXT: movk x3, #3, lsl #32 +# CHECK-NEXT: movk x3, #65533, lsl #32 +# CHECK-NEXT: mov x3, #12884901888 ## 1125899906842624 = 0x4000000000000 -# CHECK-NEXT: 2101f0: d2e00084 mov x4, #1125899906842624 -# CHECK-NEXT: 2101f4: d2ffff84 mov x4, #-1125899906842624 -# CHECK-NEXT: 2101f8: f2e00084 movk x4, #4, lsl #48 -# CHECK-NEXT: 2101fc: f2ffff84 movk 
x4, #65532, lsl #48 +# CHECK-NEXT: mov x4, #1125899906842624 +# CHECK-NEXT: mov x4, #-1125899906842624 +# CHECK-NEXT: movk x4, #4, lsl #48 +# CHECK-NEXT: movk x4, #65532, lsl #48 diff --git a/lld/test/ELF/allow-shlib-undefined-weak.s b/lld/test/ELF/allow-shlib-undefined-weak.s index 1037cbed0d859..141881fd73673 100644 --- a/lld/test/ELF/allow-shlib-undefined-weak.s +++ b/lld/test/ELF/allow-shlib-undefined-weak.s @@ -21,7 +21,7 @@ # RUN: ld.lld -shared wrap.o def.so -o wrap.so # RUN: llvm-mc -filetype=obj -triple=x86_64 start.s -o start.o -# RUN: ld.lld --no-allow-shlib-undefined start.o wrap.so ref.so -o /dev/null 2>&1 | count 0 +# RUN: ld.lld --no-allow-shlib-undefined start.o wrap.so ref.so 2>&1 | count 0 #--- start.s .globl _start diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s index c69c1ea20ce3b..a088c2595d538 100644 --- a/lld/test/ELF/allow-shlib-undefined.s +++ b/lld/test/ELF/allow-shlib-undefined.s @@ -9,40 +9,40 @@ # RUN: cp a.so b.so # RUN: llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o && ld.lld -shared empty.o -o empty.so -# RUN: ld.lld --allow-shlib-undefined main.o a.so -o /dev/null -# RUN: not ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --allow-shlib-undefined main.o a.so +# RUN: not ld.lld --no-allow-shlib-undefined main.o a.so 2>&1 | FileCheck %s ## Executable linking defaults to --no-allow-shlib-undefined. -# RUN: not ld.lld main.o a.so -o /dev/null 2>&1 | FileCheck %s -# RUN: ld.lld main.o a.so --noinhibit-exec -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN -# RUN: ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN +# RUN: not ld.lld main.o a.so 2>&1 | FileCheck %s +# RUN: ld.lld main.o a.so --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=WARN +# RUN: ld.lld main.o a.so --warn-unresolved-symbols 2>&1 | FileCheck %s --check-prefix=WARN ## -shared linking defaults to --allow-shlib-undefined. -# RUN: ld.lld -shared main.o a.so -o /dev/null +# RUN: ld.lld -shared main.o a.so ## DSO with undefines should link with or without any of these options. -# RUN: ld.lld -shared --allow-shlib-undefined a.o -o /dev/null -# RUN: ld.lld -shared --no-allow-shlib-undefined a.o -o /dev/null +# RUN: ld.lld -shared --allow-shlib-undefined a.o +# RUN: ld.lld -shared --no-allow-shlib-undefined a.o ## Perform checking even if an unresolved symbol is first seen in a regular object file. -# RUN: not ld.lld --gc-sections main.o ref.o a.so -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld --gc-sections main.o ref.o a.so 2>&1 | FileCheck %s ## Check that the error is reported for each shared library where the symbol ## is referenced. -# RUN: not ld.lld main.o a.so empty.so b.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK2 +# RUN: not ld.lld main.o a.so empty.so b.so 2>&1 | FileCheck %s --check-prefix=CHECK2 ## Test some cases when a relocatable object file provides a non-exported definition. 
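For reference, the comments deleted from aarch64-relocs.s earlier in this patch recorded the branch-relocation arithmetic that the old, address-pinned CHECK lines verified; the new CHECK lines intentionally match only the mnemonics. The R_AARCH64_CALL26 case worked out as:

R = S + A - P, with S = 0x21014c, A = 0x4, P = 0x210154
R = -0x4 = 0xfffffffc; (R & 0x0ffffffc) >> 2 = 0x03ffffff
0x94000000 | 0x03ffffff = 0x97ffffff, i.e. bl 0x210150

and the R_AARCH64_JUMP26 case as R = -0x8, giving 0x14000000 | 0x03fffffe = 0x17fffffe, i.e. b 0x210150.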
-# RUN: not ld.lld main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o def-hidden.o a.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings -o /dev/null +# RUN: not ld.lld main.o a.so def-hidden.o 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings ## Test a relocatable object file definition that is converted to STB_LOCAL. -# RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o def-hidden.o a.so --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so --version-script=local.ver 2>&1 | FileCheck %s --check-prefix=NONEXPORTED ## The section containing the definition is discarded, and we report an error. -# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o 2>&1 | FileCheck %s ## The definition def.so is ignored. # RUN: ld.lld -shared def.o -o def.so -# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings -o /dev/null +# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings # CHECK-NOT: error: # CHECK: error: undefined reference: x1{{$}} diff --git a/lld/test/ELF/arm-cmse-diagnostics.s b/lld/test/ELF/arm-cmse-diagnostics.s index d30f2431cc57a..4c8a4097e8250 100644 --- a/lld/test/ELF/arm-cmse-diagnostics.s +++ b/lld/test/ELF/arm-cmse-diagnostics.s @@ -7,11 +7,11 @@ // RUN: llvm-mc -arm-add-build-attributes -filetype=obj --triple=thumbv8m.base lib -o lib.o // RUN: llvm-mc -arm-add-build-attributes -filetype=obj --triple=thumbv8m.base app -I %S/Inputs -o app.o // RUN: llvm-objcopy --redefine-sym=entry7_duplicate=entry6_duplicate lib.o -// RUN: not ld.lld --cmse-implib --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IMPLIB -// RUN: not ld.lld --cmse-implib --in-implib=lib.o --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_MULT_INIMPLIB -// RUN: not ld.lld --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB -// RUN: not ld.lld --out-implib=out.lib app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_OUT_IMPLIB -// RUN: not ld.lld --out-implib=out.lib --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB,ERR_OUT_IMPLIB +// RUN: not ld.lld --cmse-implib --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IMPLIB +// RUN: not ld.lld --cmse-implib --in-implib=lib.o --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_MULT_INIMPLIB +// RUN: not ld.lld --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB +// RUN: not ld.lld --out-implib=out.lib app.o 2>&1 | FileCheck %s --check-prefixes=ERR_OUT_IMPLIB +// RUN: not ld.lld 
--out-implib=out.lib --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB,ERR_OUT_IMPLIB // ERR_IMPLIB: error: CMSE symbol 'entry_not_external' in import library '{{.*}}' is not global // ERR_IMPLIB: error: CMSE symbol 'entry_not_absolute' in import library '{{.*}}' is not absolute @@ -91,7 +91,7 @@ /// Test diagnostics emitted during symbol attribute checks. // RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base symattr -o symattr.o -// RUN: not ld.lld --cmse-implib symattr.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_SYMATTR +// RUN: not ld.lld --cmse-implib symattr.o 2>&1 | FileCheck %s --check-prefixes=ERR_SYMATTR // ERR_SYMATTR-NOT: __acle_se_valid_{{.*}} // ERR_SYMATTR: error: {{.*}}: cmse special symbol '__acle_se_invalid_1' is not a Thumb function definition @@ -110,9 +110,9 @@ /// Test diagnostics emitted when a symbol is removed from a later version of the import library. // RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base libv1 -o libv1.o // RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base libv2 -o libv2.o -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --out-implib=libv1.lib -o /dev/null -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv2.o --in-implib=libv1.lib --out-implib=libv2.lib -o /dev/null 2>&1 | FileCheck %s --check-prefixes=WARN_MISSING -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --in-implib=libv2.lib -o /dev/null 2>&1 | FileCheck %s --check-prefixes=WARN_NEWENTRY +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --out-implib=libv1.lib +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv2.o --in-implib=libv1.lib --out-implib=libv2.lib 2>&1 | FileCheck %s --check-prefixes=WARN_MISSING +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --in-implib=libv2.lib 2>&1 | FileCheck %s --check-prefixes=WARN_NEWENTRY // WARN_MISSING: warning: entry function 'bar' from CMSE import library is not present in secure application // WARN_NEWENTRY: warning: new entry function 'bar' introduced but no output import library specified diff --git a/lld/test/ELF/avr-reloc-error.s b/lld/test/ELF/avr-reloc-error.s index f177e44f753fa..b36a24d764c5c 100644 --- a/lld/test/ELF/avr-reloc-error.s +++ b/lld/test/ELF/avr-reloc-error.s @@ -3,13 +3,13 @@ # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-7.s -o avr-pcrel-7.o -# RUN: not ld.lld avr-pcrel-7.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \ +# RUN: not ld.lld avr-pcrel-7.o -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \ # RUN: FileCheck %s --check-prefix=PCREL7 # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-13.s -o avr-pcrel-13.o -# RUN: not ld.lld avr-pcrel-13.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \ +# RUN: not ld.lld avr-pcrel-13.o -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \ # RUN: FileCheck %s --check-prefix=PCREL13 # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-abs.s -o avr-abs.o -# RUN: not ld.lld avr-abs.o -o /dev/null -Ttext=0x1000 
--defsym=callee0=0x1009 --defsym=callee1=0x1010 2>&1 | \ +# RUN: not ld.lld avr-abs.o -Ttext=0x1000 --defsym=callee0=0x1009 --defsym=callee1=0x1010 2>&1 | \ # RUN: FileCheck %s --check-prefix=ABS #--- avr-pcrel-7.s diff --git a/lld/test/ELF/common-archive-lookup.s b/lld/test/ELF/common-archive-lookup.s index a30d0f18d01ad..9834d13ed7c24 100644 --- a/lld/test/ELF/common-archive-lookup.s +++ b/lld/test/ELF/common-archive-lookup.s @@ -69,7 +69,7 @@ # RUN: FileCheck --check-prefix=ASM %s < out.lto.s ## COMMON overrides weak. Don't extract 3.bc which provides a weak definition. -# RUN: ld.lld -o /dev/null main.o --start-lib 1.bc 3.bc --end-lib -y block | FileCheck --check-prefix=LTO_WEAK %s +# RUN: ld.lld main.o --start-lib 1.bc 3.bc --end-lib -y block | FileCheck --check-prefix=LTO_WEAK %s ## Old FORTRAN that mixes use of COMMON blocks and BLOCK DATA requires that we ## search through archives for non-tentative definitions (from the BLOCK DATA) diff --git a/lld/test/ELF/duplicated-synthetic-sym.s b/lld/test/ELF/duplicated-synthetic-sym.s index d08af3a1a52e5..9d47ec10f797f 100644 --- a/lld/test/ELF/duplicated-synthetic-sym.s +++ b/lld/test/ELF/duplicated-synthetic-sym.s @@ -1,14 +1,12 @@ // REQUIRES: x86 -// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: rm -rf %t.dir -// RUN: mkdir %t.dir -// RUN: cd %t.dir +// RUN: rm -rf %t && mkdir %t && cd %t +// RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o // RUN: echo > file.bin -// RUN: not ld.lld %t.o --format=binary file.bin -o /dev/null 2>&1 | FileCheck %s -// RUN: not ld.lld %t.o --format binary file.bin -o /dev/null 2>&1 | FileCheck %s +// RUN: not ld.lld a.o --format=binary file.bin 2>&1 | FileCheck %s +// RUN: not ld.lld a.o --format binary file.bin 2>&1 | FileCheck %s -// CHECK: duplicate symbol: _binary_file_bin_start +// CHECK: error: duplicate symbol: _binary_file_bin_start // CHECK-NEXT: defined in {{.*}}.o // CHECK-NEXT: defined in file.bin diff --git a/lld/test/ELF/input-section-flags.s b/lld/test/ELF/input-section-flags.s index 0c8e31c77b0dc..f848d55e6fddc 100644 --- a/lld/test/ELF/input-section-flags.s +++ b/lld/test/ELF/input-section-flags.s @@ -15,6 +15,7 @@ # RUN: .outsec3 : { INPUT_SECTION_FLAGS(SHF_WRITE) *(.sec.*) } \ # RUN: .outsec4 : { INPUT_SECTION_FLAGS(SHF_MERGE & !SHF_STRINGS) *(.sec.*) } \ # RUN: .outsec5 : { INPUT_SECTION_FLAGS(SHF_STRINGS) *(.sec.*) } \ +# RUN: .outsec6 : { INPUT_SECTION_FLAGS(!SHF_TLS & !SHF_EXCLUDE & !SHF_COMPRESSED & !SHF_ARM_PURECODE) *(.sec.*) } \ # RUN: } " > %t.script # RUN: ld.lld -o %t1 --script %t.script %t.o # RUN: llvm-readobj --symbols %t1 | FileCheck %s diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s index 0bbebac59bb34..d6dd8a5347e94 100644 --- a/lld/test/ELF/linkerscript/discard-section.s +++ b/lld/test/ELF/linkerscript/discard-section.s @@ -4,8 +4,8 @@ # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o -# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -z undefs -o /dev/null 2>&1 | FileCheck %s --check-prefix=LOCAL --implicit-check-not=error: -# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=LOCAL,NONLOCAL --implicit-check-not=error: +# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -z undefs 2>&1 | FileCheck %s --check-prefix=LOCAL --implicit-check-not=error: +# RUN: not ld.lld --threads=1 -T a.lds a.o b.o 2>&1 | FileCheck %s --check-prefixes=LOCAL,NONLOCAL 
--implicit-check-not=error: # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning: # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC diff --git a/lld/test/ELF/linkerscript/input-relative.s b/lld/test/ELF/linkerscript/input-relative.s index 771684c7c4f82..3f81c5f3ee9e3 100644 --- a/lld/test/ELF/linkerscript/input-relative.s +++ b/lld/test/ELF/linkerscript/input-relative.s @@ -31,13 +31,13 @@ ## The rules does not apply to an absolute path. # RUN: echo 'INPUT(/libb.a)' > dir/absolute.lds -# RUN: not ld.lld a.o dir/absolute.lds -o /dev/null +# RUN: not ld.lld a.o dir/absolute.lds ## If the parent directory of the current linker script does not contain the file, ## fall back to the current working directory. # RUN: cp libb.a libc.a # RUN: echo 'INPUT(libc.a)' > dir/fallback.lds -# RUN: ld.lld a.o dir/fallback.lds -o /dev/null +# RUN: ld.lld a.o dir/fallback.lds .globl _start _start: diff --git a/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test b/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test index c82a93efc1aae..7a18015cfcab4 100644 --- a/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test +++ b/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test @@ -5,7 +5,7 @@ ## If we don't merge adjacent duplicate entries, __code_size will be negative and ## . += __code_size will trigger a "move location counter backward" error. ## LLD may report more errors further down, but there is only one "move location counter backward" error. -# RUN: not ld.lld -z norelro -z max-page-size=4096 -T a.t a.o -o /dev/null --no-merge-exidx-entries 2>&1 | \ +# RUN: not ld.lld -z norelro -z max-page-size=4096 -T a.t a.o --no-merge-exidx-entries 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR --implicit-check-not=error: # ERR: error: a.t:9: unable to move location counter (0x1000) backward to 0xf6c for section 'dummy1' diff --git a/lld/test/ELF/lto/archive-mixed.test b/lld/test/ELF/lto/archive-mixed.test index fbb84a1d8bb76..6f1db87c89ca1 100644 --- a/lld/test/ELF/lto/archive-mixed.test +++ b/lld/test/ELF/lto/archive-mixed.test @@ -19,22 +19,22 @@ ; RUN: llvm-ar rc other.bc.a a.bc ; RUN: llvm-ar rc other.o.a a.o -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.bc.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.bc.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.bc.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.bc.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.o.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.o.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.o.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.o.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.bc.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.o.b.bc.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.bc.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.o.b.bc.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.o.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.o.b.o.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} ;; Not an LTO test case, but here for completeness. 
-; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.o.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.o.b.o.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} ; CHECK: ref.o diff --git a/lld/test/ELF/lto/obj-path.ll b/lld/test/ELF/lto/obj-path.ll index c0bb4addf2466..bf291ff8a0458 100644 --- a/lld/test/ELF/lto/obj-path.ll +++ b/lld/test/ELF/lto/obj-path.ll @@ -54,14 +54,14 @@ ;; With --thinlto-index-only, --lto-obj-path= creates just one file. ; RUN: rm -f objpath.o objpath.o1 objpath.o2 -; RUN: ld.lld --thinlto-index-only --lto-obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: ld.lld --thinlto-index-only --lto-obj-path=objpath.o -shared 1.bc d/2.bc ; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY ; RUN: not ls objpath.o1 ; RUN: not ls objpath.o2 ;; Test --plugin-opt=obj-path=. ; RUN: rm -f objpath.o -; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc ; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY ;; Ensure lld emits empty combined module if specific obj-path. diff --git a/lld/test/ELF/lto/parallel.ll b/lld/test/ELF/lto/parallel.ll index 6b2c352b0a965..e32225c3ed3b8 100644 --- a/lld/test/ELF/lto/parallel.ll +++ b/lld/test/ELF/lto/parallel.ll @@ -5,7 +5,7 @@ ; RUN: llvm-nm out.lto.o | FileCheck --check-prefix=CHECK0 %s ; RUN: llvm-nm out.lto.1.o | FileCheck --check-prefix=CHECK1 %s -; RUN: not ld.lld --lto-partitions=0 a.bc -o /dev/null 2>&1 | FileCheck --check-prefix=INVALID %s +; RUN: not ld.lld --lto-partitions=0 a.bc 2>&1 | FileCheck --check-prefix=INVALID %s ; INVALID: --lto-partitions: number of threads must be > 0 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/lld/test/ELF/non-abs-reloc.s b/lld/test/ELF/non-abs-reloc.s index 42b5f8fec1c43..e37a0ec12414b 100644 --- a/lld/test/ELF/non-abs-reloc.s +++ b/lld/test/ELF/non-abs-reloc.s @@ -15,13 +15,13 @@ // DISASM-NEXT: 6: call{{.}} 0x5 /// There is currently no error for -r. See also https://github.com/ClangBuiltLinux/linux/issues/1937 -// RUN: ld.lld -T lds -r a.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=REL-R --implicit-check-not=warning: +// RUN: ld.lld -T lds -r a.o 2>&1 | FileCheck %s --check-prefix=REL-R --implicit-check-not=warning: // REL-R: warning: {{.*}}:(.nonalloc1+0xa): has non-ABS relocation R_386_PC32 against symbol '' // RUN: llvm-mc -filetype=obj -triple=x86_64 asm -o b.o // RUN: ld.lld -T lds b.o -o b 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=warning: // RUN: llvm-objdump -D --no-show-raw-insn b | FileCheck --check-prefix=DISASM %s -// RUN: ld.lld -T lds -r b.o -o /dev/null --fatal-warnings +// RUN: ld.lld -T lds -r b.o --fatal-warnings // CHECK2: warning: {{.*}}.o:(.nonalloc1+0x1): has non-ABS relocation R_X86_64_PC32 against symbol '_start' // CHECK2-NEXT: warning: {{.*}}.o:(.nonalloc1+0x6): has non-ABS relocation R_X86_64_PC32 against symbol 'ifunc' // CHECK2-NEXT: warning: {{.*}}.o:(.nonalloc1+0xa): has non-ABS relocation R_X86_64_PC32 against symbol '' diff --git a/lld/test/ELF/print-archive-stats.s b/lld/test/ELF/print-archive-stats.s index 2dd236f8e0a1f..5116685623ce2 100644 --- a/lld/test/ELF/print-archive-stats.s +++ b/lld/test/ELF/print-archive-stats.s @@ -10,7 +10,7 @@ # RUN: llvm-ar rc 1.a 1.o 2.o 3.o # RUN: llvm-ar rc lib2.a -# RUN: ld.lld a.o %t/weak.a 1.a -L. 
--print-archive-stats=a.txt -o /dev/null +# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=a.txt # RUN: FileCheck --input-file=a.txt -DT=%t %s --match-full-lines --strict-whitespace ## Fetches 0 member from %t/weak.a and 2 members from %t1.a @@ -20,10 +20,10 @@ # CHECK-NEXT:0 0 {{.*}}lib2.a ## - means stdout. -# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=- -o /dev/null | diff a.txt - +# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=- | diff a.txt - ## The second 1.a has 0 fetched member. -# RUN: ld.lld a.o %t/weak.a -L. -l:1.a -l:1.a --print-archive-stats=- -o /dev/null | \ +# RUN: ld.lld a.o %t/weak.a -L. -l:1.a -l:1.a --print-archive-stats=- | \ # RUN: FileCheck --check-prefix=CHECK2 %s # CHECK2: members extracted archive # CHECK2-NEXT: 1 0 {{.*}}weak.a @@ -31,7 +31,7 @@ # CHECK2-NEXT: 3 0 {{.*}}1.a # CHECK2-NEXT: 0 0 {{.*}}lib2.a -# RUN: not ld.lld -shared a.o -L. --print-archive-stats=/ -o /dev/null 2>&1 | FileCheck --check-prefix=ERR %s +# RUN: not ld.lld -shared a.o -L. --print-archive-stats=/ 2>&1 | FileCheck --check-prefix=ERR %s # ERR: error: --print-archive-stats=: cannot open /: {{.*}} #--- a.s diff --git a/lld/test/ELF/remap-inputs.test b/lld/test/ELF/remap-inputs.test index 0f9cafa987ac9..1be01c792a37b 100644 --- a/lld/test/ELF/remap-inputs.test +++ b/lld/test/ELF/remap-inputs.test @@ -17,26 +17,26 @@ # REPRO-NEXT: d.so ## --remap-inputs can also be specified multiple times. -# RUN: ld.lld --remap-inputs 'aa.o=a.o' --remap-inputs='d[d].so=d.so' aa.o b.o c.a d.so -o /dev/null +# RUN: ld.lld --remap-inputs 'aa.o=a.o' --remap-inputs='d[d].so=d.so' aa.o b.o c.a d.so ## A multiple-to-one pattern may easily cause issues. Users should be careful. -# RUN: not ld.lld --remap-inputs-file=3.map aa.o bb.bc -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=3.map aa.o bb.bc 2>&1 | \ # RUN: FileCheck %s --check-prefix=DUPLICATE --implicit-check-not=error: # DUPLICATE: error: duplicate symbol: _start -# RUN: not ld.lld --remap-inputs-file=err1.map aa.o bb.bc -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=err1.map aa.o bb.bc 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR1 --implicit-check-not=error: # ERR1: error: err1.map:2: parse error, not 'from-glob=to-file' # ERR1-NEXT: error: cannot open bb.bc: {{.*}} -# RUN: not ld.lld --remap-inputs-file=err2.map aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=err2.map aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR2 --implicit-check-not=error: # ERR2: error: err2.map:1: invalid glob pattern, unmatched '[': aa.[o # ERR2-NEXT: error: cannot open aa.o: {{.*}} -# RUN: not ld.lld --remap-inputs=aa.o aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs=aa.o aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR3 --implicit-check-not=error: -# RUN: not ld.lld --remap-inputs=aa.o= aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs=aa.o= aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR3 --implicit-check-not=error: # ERR3: error: --remap-inputs: parse error, not 'from-glob=to-file' # ERR3-NEXT: error: cannot open aa.o: {{.*}} diff --git a/lld/test/ELF/reproduce-deplibs.s b/lld/test/ELF/reproduce-deplibs.s index 06c25a2239834..48486d0e2bde7 100644 --- a/lld/test/ELF/reproduce-deplibs.s +++ b/lld/test/ELF/reproduce-deplibs.s @@ -8,7 +8,7 @@ # RUN: llvm-ar rc foo.a foo.o # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o bar.o -# RUN: ld.lld bar.o -o /dev/null --reproduce repro.tar +# RUN: ld.lld bar.o --reproduce repro.tar # RUN: tar tf repro.tar | FileCheck 
-DPATH='%:t.dir' %s # CHECK: [[PATH]]/foo.a diff --git a/lld/test/ELF/reproduce-lto.s b/lld/test/ELF/reproduce-lto.s index 36838f21388ef..b1a5bab122c56 100644 --- a/lld/test/ELF/reproduce-lto.s +++ b/lld/test/ELF/reproduce-lto.s @@ -5,10 +5,10 @@ # RUN: rm -rf %t.dir # RUN: mkdir -p %t.dir/build1 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.dir/build1/foo.o -# RUN: echo > %t.dir/build1/empty_profile.txt # RUN: cd %t.dir -# RUN: ld.lld build1/foo.o -o /dev/null --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o build1/foo.o +# RUN: echo > build1/empty_profile.txt +# RUN: ld.lld build1/foo.o --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt # RUN: tar tvf repro1.tar | FileCheck %s --implicit-check-not={{.}} # CHECK-DAG: {{.*}} repro1/{{.*}}/empty_profile.txt diff --git a/lld/test/ELF/riscv-attributes.s b/lld/test/ELF/riscv-attributes.s index d003a298101cb..13b2c7a24d0b8 100644 --- a/lld/test/ELF/riscv-attributes.s +++ b/lld/test/ELF/riscv-attributes.s @@ -31,7 +31,7 @@ # RUN: llvm-readobj --arch-specific out3 | FileCheck %s --check-prefix=CHECK3 # RUN: llvm-mc -filetype=obj -triple=riscv64 invalid_arch1.s -o invalid_arch1.o -# RUN: not ld.lld invalid_arch1.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID_ARCH1 --implicit-check-not=error: +# RUN: not ld.lld invalid_arch1.o 2>&1 | FileCheck %s --check-prefix=INVALID_ARCH1 --implicit-check-not=error: # INVALID_ARCH1: error: invalid_arch1.o:(.riscv.attributes): rv64i2: extension lacks version in expected format ## A zero value attribute is not printed. @@ -41,20 +41,20 @@ ## Differing stack_align values lead to an error. # RUN: llvm-mc -filetype=obj -triple=riscv64 diff_stack_align.s -o diff_stack_align.o -# RUN: not ld.lld a.o b.o c.o diff_stack_align.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=STACK_ALIGN --implicit-check-not=error: +# RUN: not ld.lld a.o b.o c.o diff_stack_align.o 2>&1 | FileCheck %s --check-prefix=STACK_ALIGN --implicit-check-not=error: # STACK_ALIGN: error: diff_stack_align.o:(.riscv.attributes) has stack_align=32 but a.o:(.riscv.attributes) has stack_align=16 ## RISC-V tag merging for atomic_abi values A6C and A7 lead to an error. # RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_A6C.s -o atomic_abi_A6C.o # RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_A7.s -o atomic_abi_A7.o -# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_A7.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_ERROR --implicit-check-not=error: +# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_A7.o 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_ERROR --implicit-check-not=error: # ATOMIC_ABI_ERROR: error: atomic abi mismatch for .riscv.attributes # ATOMIC_ABI_ERROR-NEXT: >>> atomic_abi_A6C.o:(.riscv.attributes): atomic_abi=1 # ATOMIC_ABI_ERROR-NEXT: >>> atomic_abi_A7.o:(.riscv.attributes): atomic_abi=3 ## RISC-V tag merging for atomic_abi values A6C and invalid lead to an error. 
# RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_invalid.s -o atomic_abi_invalid.o -# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_invalid.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_INVALID --implicit-check-not=error: +# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_invalid.o 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_INVALID --implicit-check-not=error: # ATOMIC_ABI_INVALID: error: unknown atomic abi for .riscv.attributes # ATOMIC_ABI_INVALID-NEXT: >>> atomic_abi_invalid.o:(.riscv.attributes): atomic_abi=42 diff --git a/lld/test/ELF/unknown-section.test b/lld/test/ELF/unknown-section.test index f6ecca29a22ae..faf420e1fb5c4 100644 --- a/lld/test/ELF/unknown-section.test +++ b/lld/test/ELF/unknown-section.test @@ -1,6 +1,6 @@ # RUN: rm -rf %t && mkdir %t && cd %t # RUN: yaml2obj %s -o a.o -# RUN: not ld.lld a.o -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: +# RUN: not ld.lld a.o 2>&1 | FileCheck %s --implicit-check-not=error: # CHECK: error: a.o:(relr): unknown section type 0x13 # CHECK-NEXT: error: a.o:(regular): unknown section type 0x15 diff --git a/lld/test/ELF/why-extract.s b/lld/test/ELF/why-extract.s index a41db8d9fd49a..3235bce5a7167 100644 --- a/lld/test/ELF/why-extract.s +++ b/lld/test/ELF/why-extract.s @@ -12,18 +12,18 @@ # RUN: cd %t ## Nothing is extracted from an archive. The file is created with just a header. -# RUN: ld.lld main.o a.o b.a -o /dev/null --why-extract=why1.txt +# RUN: ld.lld main.o a.o b.a --why-extract=why1.txt # RUN: FileCheck %s --input-file=why1.txt --check-prefix=CHECK1 --match-full-lines --strict-whitespace # CHECK1:reference extracted symbol # CHECK1-NOT:{{.}} ## Some archive members are extracted. -# RUN: ld.lld main.o a_b.a b.a -o /dev/null --why-extract=why2.txt +# RUN: ld.lld main.o a_b.a b.a --why-extract=why2.txt # RUN: FileCheck %s --input-file=why2.txt --check-prefix=CHECK2 --match-full-lines --strict-whitespace ## A relocation error does not suppress the output. -# RUN: rm -f why2.txt && not ld.lld main.o a_b.a b.a err.o -o /dev/null --why-extract=why2.txt +# RUN: rm -f why2.txt && not ld.lld main.o a_b.a b.a err.o --why-extract=why2.txt # RUN: FileCheck %s --input-file=why2.txt --check-prefix=CHECK2 --match-full-lines --strict-whitespace # CHECK2:reference extracted symbol @@ -31,12 +31,12 @@ # CHECK2-NEXT:a_b.a(a_b.o) b.a(b.o) b() ## An undefined symbol error does not suppress the output. -# RUN: not ld.lld main.o a_b.a -o /dev/null --why-extract=why3.txt +# RUN: not ld.lld main.o a_b.a --why-extract=why3.txt # RUN: FileCheck %s --input-file=why3.txt --check-prefix=CHECK3 --match-full-lines --strict-whitespace ## Check that backward references are supported. ## - means stdout. 
-# RUN: ld.lld b.a a_b.a main.o -o /dev/null --why-extract=- | FileCheck %s --check-prefix=CHECK4
+# RUN: ld.lld b.a a_b.a main.o --why-extract=- | FileCheck %s --check-prefix=CHECK4

 # CHECK3:reference extracted symbol
 # CHECK3-NEXT:main.o a_b.a(a_b.o) a

@@ -45,34 +45,34 @@
 # CHECK4-NEXT:a_b.a(a_b.o) b.a(b.o) b()
 # CHECK4-NEXT:main.o a_b.a(a_b.o) a

-# RUN: ld.lld main.o a_b.a b.a -o /dev/null --no-demangle --why-extract=- | FileCheck %s --check-prefix=MANGLED
+# RUN: ld.lld main.o a_b.a b.a --no-demangle --why-extract=- | FileCheck %s --check-prefix=MANGLED
 # MANGLED: a_b.a(a_b.o) b.a(b.o) _Z1bv

-# RUN: ld.lld main.o a.a b.a -o /dev/null -u _Z1bv --why-extract=- | FileCheck %s --check-prefix=UNDEFINED
+# RUN: ld.lld main.o a.a b.a -u _Z1bv --why-extract=- | FileCheck %s --check-prefix=UNDEFINED

 ## We insert -u symbol before processing other files, so its name is <internal>.
 ## This is not ideal.
 # UNDEFINED: <internal> b.a(b.o) b()

-# RUN: ld.lld main.o a.a b.a -o /dev/null --undefined-glob '_Z1b*' --why-extract=- | FileCheck %s --check-prefix=UNDEFINED_GLOB
+# RUN: ld.lld main.o a.a b.a --undefined-glob '_Z1b*' --why-extract=- | FileCheck %s --check-prefix=UNDEFINED_GLOB
 # UNDEFINED_GLOB: --undefined-glob b.a(b.o) b()

-# RUN: ld.lld main.o a.a b.a -o /dev/null -e _Z1bv --why-extract=- | FileCheck %s --check-prefix=ENTRY
+# RUN: ld.lld main.o a.a b.a -e _Z1bv --why-extract=- | FileCheck %s --check-prefix=ENTRY
 # ENTRY: --entry b.a(b.o) b()

-# RUN: ld.lld main.o b.a -o /dev/null -T a.lds --why-extract=- | FileCheck %s --check-prefix=SCRIPT
+# RUN: ld.lld main.o b.a -T a.lds --why-extract=- | FileCheck %s --check-prefix=SCRIPT
 # SCRIPT: b.a(b.o) b()

-# RUN: ld.lld main.o --start-lib a_b.o b.o --end-lib -o /dev/null --why-extract=- | FileCheck %s --check-prefix=LAZY
+# RUN: ld.lld main.o --start-lib a_b.o b.o --end-lib --why-extract=- | FileCheck %s --check-prefix=LAZY
 # LAZY: main.o a_b.o a
 # LAZY: a_b.o b.o b()

-# RUN: not ld.lld -shared main.o -o /dev/null --why-extract=/ 2>&1 | FileCheck %s --check-prefix=ERR
+# RUN: not ld.lld -shared main.o --why-extract=/ 2>&1 | FileCheck %s --check-prefix=ERR
 # ERR: error: cannot open --why-extract= file /: {{.*}}
diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll
index 41868a0b2b50b..79f1d384919d9 100644
--- a/lld/test/wasm/data-segments.ll
+++ b/lld/test/wasm/data-segments.ll
@@ -6,36 +6,36 @@
 ; RUN: llc --mtriple=wasm32-unknown-unknown -filetype=obj %s -o %t.atomics.bulk-mem.pic.o -relocation-model=pic -mattr=+atomics,+bulk-memory,+mutable-globals
 ; RUN: llc --mtriple=wasm64-unknown-unknown -filetype=obj %s -o %t.atomics.bulk-mem.pic-mem64.o -relocation-model=pic -mattr=+atomics,+bulk-memory,+mutable-globals

-; atomics, shared memory => error
+;; atomics, shared memory => error
 ; RUN: not wasm-ld -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.o -o %t.atomics.wasm 2>&1 | FileCheck %s --check-prefix ERROR

-; bulk memory, unshared memory => active segments
+;; bulk memory, unshared memory => active segments
 ; RUN: wasm-ld -no-gc-sections --no-entry %t.bulk-mem.o -o %t.bulk-mem.wasm
 ; RUN: obj2yaml %t.bulk-mem.wasm | FileCheck %s --check-prefixes ACTIVE,ACTIVE32

-; bulk memory, unshared memory, wasm64 => active segments
+;; bulk memory, unshared memory, wasm64 => active segments
 ; RUN: wasm-ld -mwasm64 -no-gc-sections --no-entry %t.bulk-mem64.o -o %t.bulk-mem64.wasm
 ; RUN: obj2yaml %t.bulk-mem64.wasm | FileCheck %s --check-prefixes ACTIVE,ACTIVE64

-; atomics, bulk memory, shared memory => passive segments
+;; atomics, 
bulk memory, shared memory => passive segments ; RUN: wasm-ld -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.o -o %t.atomics.bulk-mem.wasm ; RUN: obj2yaml %t.atomics.bulk-mem.wasm | FileCheck %s --check-prefix PASSIVE ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.atomics.bulk-mem.wasm | FileCheck %s --check-prefixes DIS,NOPIC-DIS -DPTR=i32 -; atomics, bulk memory, shared memory, wasm64 => passive segments +;; atomics, bulk memory, shared memory, wasm64 => passive segments ; RUN: wasm-ld -mwasm64 -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem64.o -o %t.atomics.bulk-mem64.wasm ; RUN: obj2yaml %t.atomics.bulk-mem64.wasm | FileCheck %s --check-prefix PASSIVE ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.atomics.bulk-mem64.wasm | FileCheck %s --check-prefixes DIS,NOPIC-DIS -DPTR=i64 -; Also test in combination with PIC/pie +;; Also test in combination with PIC/pie ; RUN: wasm-ld --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic.o -o %t.pic.wasm ; RUN: obj2yaml %t.pic.wasm | FileCheck %s --check-prefixes PASSIVE-PIC,PASSIVE32-PIC -; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i32 +; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i32 -; Also test in combination with PIC/pie + wasm64 +;; Also test in combination with PIC/pie + wasm64 ; RUN: wasm-ld -mwasm64 --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic-mem64.o -o %t.pic-mem64.wasm ; RUN: obj2yaml %t.pic-mem64.wasm | FileCheck %s --check-prefixes PASSIVE-PIC,PASSIVE64-PIC -; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic-mem64.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i64 +; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic-mem64.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i64 @a = hidden global [6 x i8] c"hello\00", align 1 @b = hidden global [8 x i8] c"goodbye\00", align 1 @@ -151,7 +151,7 @@ ; PASSIVE-PIC-NEXT: - Index: 2 ; PASSIVE-PIC-NEXT: Name: __wasm_init_memory -; no data relocations. +;; no data relocations. 
; DIS-LABEL: <__wasm_call_ctors>:
; DIS-EMPTY:
; DIS-NEXT: end
diff --git a/lld/test/wasm/lto/Inputs/libcall-return-addr.ll b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll
new file mode 100644
index 0000000000000..271bdae11e49d
--- /dev/null
+++ b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20"
+target triple = "wasm32-unknown-emscripten"
+
+define ptr @emscripten_return_address() {
+  ret ptr null
+}
diff --git a/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll b/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll
deleted file mode 100644
index 1439d7f8b4cb4..0000000000000
--- a/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
-target triple = "wasm32-unknown-unknown"
-
-define half @__truncsfhf2(float) {
-  ret half 0.0
-}
diff --git a/lld/test/wasm/lto/libcall-return-addr.ll b/lld/test/wasm/lto/libcall-return-addr.ll
new file mode 100644
index 0000000000000..74eba74f97018
--- /dev/null
+++ b/lld/test/wasm/lto/libcall-return-addr.ll
@@ -0,0 +1,18 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: llvm-as %p/Inputs/libcall-return-addr.ll -o %t.return-addr.o
+; RUN: rm -f %t.a
+; RUN: llvm-ar rcs %t.a %t.return-addr.o
+; RUN: not wasm-ld --export-all %t.o %t.a -o %t.wasm 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20"
+target triple = "wasm32-unknown-emscripten"
+
+@g_ptr = global ptr null
+
+define void @_start() {
+  %addr = call ptr @llvm.returnaddress(i32 1)
+  store ptr %addr, ptr @g_ptr
+  ret void
+}
+
+; CHECK: wasm-ld: error: {{.*}}return-addr.o): attempt to add bitcode file after LTO (emscripten_return_address)
diff --git a/lld/test/wasm/lto/libcall-truncsfhf2.ll b/lld/test/wasm/lto/libcall-truncsfhf2.ll
deleted file mode 100644
index fd07bb53890f6..0000000000000
--- a/lld/test/wasm/lto/libcall-truncsfhf2.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llvm-as %s -o %t.o
-; RUN: llvm-as %p/Inputs/libcall-truncsfhf2.ll -o %t.truncsfhf2.o
-; RUN: rm -f %t.a
-; RUN: llvm-ar rcs %t.a %t.truncsfhf2.o
-; RUN: not wasm-ld --export-all %t.o %t.a -o %t.wasm 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
-target triple = "wasm32-unknown-unknown"
-
-@g_float = global float 0.0
-@g_half = global half 0.0
-
-define void @_start() {
-  %val1 = load float, ptr @g_float
-  %v0 = fptrunc float %val1 to half
-  store half %v0, ptr @g_half
-  ret void
-}
-
-; CHECK: wasm-ld: error: {{.*}}truncsfhf2.o): attempt to add bitcode file after LTO (__truncsfhf2)
diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index d7751ca045bb2..7f08f3dd26106 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -131,17 +131,13 @@ class Debugger : public std::enable_shared_from_this<Debugger>,

   void SetAsyncExecution(bool async);

-  lldb::FileSP GetInputFileSP() { return m_input_file_sp; }
-
-  lldb::StreamFileSP GetOutputStreamSP() { return m_output_stream_sp; }
-
-  lldb::StreamFileSP GetErrorStreamSP() { return m_error_stream_sp; }
-
   File &GetInputFile() { return *m_input_file_sp; }

-  File &GetOutputFile() { return m_output_stream_sp->GetFile(); }
+  lldb::FileSP GetInputFileSP() { return m_input_file_sp; }
+
+  lldb::FileSP GetOutputFileSP() { return m_output_stream_sp->GetFileSP(); }

-  File &GetErrorFile() { return m_error_stream_sp->GetFile(); }
+  lldb::FileSP GetErrorFileSP() { return m_error_stream_sp->GetFileSP(); }

   repro::DataRecorder *GetInputRecorder();

@@ -649,6 +645,14 @@ class Debugger : public std::enable_shared_from_this<Debugger>,

   void PrintProgress(const ProgressEventData &data);

+  /// Except for Debugger and IOHandler, GetOutputStreamSP and GetErrorStreamSP
+  /// should not be used directly. Use GetAsyncOutputStream and
+  /// GetAsyncErrorStream instead.
+  /// @{
+  lldb::StreamFileSP GetOutputStreamSP() { return m_output_stream_sp; }
+  lldb::StreamFileSP GetErrorStreamSP() { return m_error_stream_sp; }
+  /// @}
+
   void PushIOHandler(const lldb::IOHandlerSP &reader_sp,
                      bool cancel_top_handler = true);

diff --git a/lldb/include/lldb/Core/StreamAsynchronousIO.h b/lldb/include/lldb/Core/StreamAsynchronousIO.h
index b7adbc42096ce..7ae65757e2d73 100644
--- a/lldb/include/lldb/Core/StreamAsynchronousIO.h
+++ b/lldb/include/lldb/Core/StreamAsynchronousIO.h
@@ -18,9 +18,17 @@ namespace lldb_private {

 class Debugger;

+/// A stream meant for asynchronously printing output. Output is buffered until
+/// the stream is flushed or destroyed. Printing is handled by the currently
+/// active IOHandler, or the debugger's output or error stream if there is none.
 class StreamAsynchronousIO : public Stream {
 public:
-  StreamAsynchronousIO(Debugger &debugger, bool for_stdout, bool colors);
+  enum ForSTDOUT : bool {
+    STDOUT = true,
+    STDERR = false,
+  };
+
+  StreamAsynchronousIO(Debugger &debugger, ForSTDOUT for_stdout);

   ~StreamAsynchronousIO() override;

@@ -32,7 +40,7 @@ class StreamAsynchronousIO : public Stream {
 private:
   Debugger &m_debugger;
   std::string m_data;
-  bool m_for_stdout;
+  ForSTDOUT m_for_stdout;
 };

 } // namespace lldb_private
diff --git a/lldb/include/lldb/Symbol/LineTable.h b/lldb/include/lldb/Symbol/LineTable.h
index 6d158ab518879..f66081b6ee110 100644
--- a/lldb/include/lldb/Symbol/LineTable.h
+++ b/lldb/include/lldb/Symbol/LineTable.h
@@ -102,6 +102,19 @@ class LineTable {

   void GetDescription(Stream *s, Target *target, lldb::DescriptionLevel level);

+  /// Helper functions for line table iteration. \c lower_bound returns the
+  /// index of the first line entry which ends after the given address (i.e.,
+  /// the first entry which contains the given address or comes after it).
+  /// \c upper_bound returns the index of the first line entry which begins on
+  /// or after the given address (i.e., the entry which would come after the
+  /// entry containing the given address, if such an entry exists). Both
+  /// functions return GetSize() if there is no such entry. The functions are
+  /// most useful in combination: iterating from lower_bound(a) to
+  /// upper_bound(b) returns all line entries which intersect the half-open
+  /// range [a,b).
+  uint32_t lower_bound(const Address &so_addr) const;
+  uint32_t upper_bound(const Address &so_addr) const;
+
   /// Find a line entry that contains the section offset address \a so_addr.
   ///
   /// \param[in] so_addr
diff --git a/lldb/include/lldb/Target/ThreadPlanTracer.h b/lldb/include/lldb/Target/ThreadPlanTracer.h
index a6fd2f031dc22..7c45e213f94f1 100644
--- a/lldb/include/lldb/Target/ThreadPlanTracer.h
+++ b/lldb/include/lldb/Target/ThreadPlanTracer.h
@@ -56,7 +56,7 @@ class ThreadPlanTracer {
   Process &m_process;
   lldb::tid_t m_tid;

-  Stream *GetLogStream();
+  lldb::StreamSP GetLogStreamSP();

   virtual void Log();

diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp
index bf19d2ff8333c..e646b09e05852 100644
--- a/lldb/source/API/SBDebugger.cpp
+++ b/lldb/source/API/SBDebugger.cpp
@@ -509,14 +509,14 @@ SBFile SBDebugger::GetInputFile() {
 FILE *SBDebugger::GetOutputFileHandle() {
   LLDB_INSTRUMENT_VA(this);
   if (m_opaque_sp)
-    return m_opaque_sp->GetOutputStreamSP()->GetFile().GetStream();
+    return m_opaque_sp->GetOutputFileSP()->GetStream();
   return nullptr;
 }

 SBFile SBDebugger::GetOutputFile() {
   LLDB_INSTRUMENT_VA(this);
   if (m_opaque_sp)
-    return SBFile(m_opaque_sp->GetOutputStreamSP()->GetFileSP());
+    return SBFile(m_opaque_sp->GetOutputFileSP());
   return SBFile();
 }

@@ -524,7 +524,7 @@ FILE *SBDebugger::GetErrorFileHandle() {
   LLDB_INSTRUMENT_VA(this);

   if (m_opaque_sp)
-    return m_opaque_sp->GetErrorStreamSP()->GetFile().GetStream();
+    return m_opaque_sp->GetErrorFileSP()->GetStream();
   return nullptr;
 }

@@ -532,7 +532,7 @@ SBFile SBDebugger::GetErrorFile() {
   LLDB_INSTRUMENT_VA(this);
   SBFile file;
   if (m_opaque_sp)
-    return SBFile(m_opaque_sp->GetErrorStreamSP()->GetFileSP());
+    return SBFile(m_opaque_sp->GetErrorFileSP());
   return SBFile();
 }

@@ -573,8 +573,8 @@ void SBDebugger::HandleCommand(const char *command) {

     sb_interpreter.HandleCommand(command, result, false);

-    result.PutError(m_opaque_sp->GetErrorStreamSP()->GetFileSP());
-    result.PutOutput(m_opaque_sp->GetOutputStreamSP()->GetFileSP());
+    result.PutError(m_opaque_sp->GetErrorFileSP());
+    result.PutOutput(m_opaque_sp->GetOutputFileSP());

     if (!m_opaque_sp->GetAsyncExecution()) {
       SBProcess process(GetCommandInterpreter().GetProcess());
diff --git a/lldb/source/Commands/CommandObjectGUI.cpp b/lldb/source/Commands/CommandObjectGUI.cpp
index b56e49b073b03..8630171bae9d1 100644
--- a/lldb/source/Commands/CommandObjectGUI.cpp
+++ b/lldb/source/Commands/CommandObjectGUI.cpp
@@ -28,10 +28,10 @@ void CommandObjectGUI::DoExecute(Args &args, CommandReturnObject &result) {
 #if LLDB_ENABLE_CURSES
   Debugger &debugger = GetDebugger();

-  File &input = debugger.GetInputFile();
-  File &output = debugger.GetOutputFile();
-  if (input.GetStream() && output.GetStream() && input.GetIsRealTerminal() &&
-      input.GetIsInteractive()) {
+  FileSP input_sp = debugger.GetInputFileSP();
+  FileSP output_sp = debugger.GetOutputFileSP();
+  if (input_sp->GetStream() && output_sp->GetStream() &&
+      input_sp->GetIsRealTerminal() && input_sp->GetIsInteractive()) {
     IOHandlerSP io_handler_sp(new IOHandlerCursesGUI(debugger));
     if (io_handler_sp)
       debugger.RunIOHandlerAsync(io_handler_sp);
diff --git a/lldb/source/Commands/CommandObjectLog.cpp b/lldb/source/Commands/CommandObjectLog.cpp
index 5fb2dfaab8de0..17efae189b05e 100644
--- a/lldb/source/Commands/CommandObjectLog.cpp
+++ b/lldb/source/Commands/CommandObjectLog.cpp
@@ -394,7 +394,8 @@ class CommandObjectLogDump : public CommandObjectParsed {
           (*file)->GetDescriptor(), /*shouldClose=*/true);
     } else {
       stream_up = std::make_unique<llvm::raw_fd_ostream>(
-          GetDebugger().GetOutputFile().GetDescriptor(), /*shouldClose=*/false);
+          GetDebugger().GetOutputFileSP()->GetDescriptor(),
+          /*shouldClose=*/false);
     }

     const std::string channel = std::string(args[0].ref());
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index 18569e155b517..8b7814d434ee9 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -947,7 +947,7 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton)
   if (term && !strcmp(term, "dumb"))
     SetUseColor(false);
   // Turn off use-color if we don't write to a terminal with color support.
-  if (!GetOutputFile().GetIsTerminalWithColors())
+  if (!GetOutputFileSP()->GetIsTerminalWithColors())
     SetUseColor(false);

   if (Diagnostics::Enabled()) {
@@ -1321,11 +1321,13 @@ bool Debugger::PopIOHandler(const IOHandlerSP &pop_reader_sp) {
 }

 StreamSP Debugger::GetAsyncOutputStream() {
-  return std::make_shared<StreamAsynchronousIO>(*this, true, GetUseColor());
+  return std::make_shared<StreamAsynchronousIO>(*this,
+                                                StreamAsynchronousIO::STDOUT);
 }

 StreamSP Debugger::GetAsyncErrorStream() {
-  return std::make_shared<StreamAsynchronousIO>(*this, false, GetUseColor());
+  return std::make_shared<StreamAsynchronousIO>(*this,
+                                                StreamAsynchronousIO::STDERR);
 }

 void Debugger::RequestInterrupt() {
@@ -1678,7 +1680,7 @@ bool Debugger::EnableLog(llvm::StringRef channel,
                   LLDB_LOG_OPTION_PREPEND_TIMESTAMP |
                   LLDB_LOG_OPTION_PREPEND_THREAD_NAME;
   } else if (log_file.empty()) {
     log_handler_sp =
-        CreateLogHandler(log_handler_kind, GetOutputFile().GetDescriptor(),
+        CreateLogHandler(log_handler_kind, GetOutputFileSP()->GetDescriptor(),
                          /*should_close=*/false, buffer_size);
   } else {
     auto pos = m_stream_handlers.find(log_file);
@@ -2111,8 +2113,8 @@ void Debugger::HandleProgressEvent(const lldb::EventSP &event_sp) {
   // Determine whether the current output file is an interactive terminal with
   // color support. We assume that if we support ANSI escape codes we support
   // vt100 escape codes.
-  File &file = GetOutputFile();
-  if (!file.GetIsInteractive() || !file.GetIsTerminalWithColors())
+  FileSP file_sp = GetOutputFileSP();
+  if (!file_sp->GetIsInteractive() || !file_sp->GetIsTerminalWithColors())
     return;

   StreamSP output = GetAsyncOutputStream();
diff --git a/lldb/source/Core/StreamAsynchronousIO.cpp b/lldb/source/Core/StreamAsynchronousIO.cpp
index c2c64b61ab726..dbd56a69675b4 100644
--- a/lldb/source/Core/StreamAsynchronousIO.cpp
+++ b/lldb/source/Core/StreamAsynchronousIO.cpp
@@ -14,20 +14,20 @@
 using namespace lldb;
 using namespace lldb_private;

-StreamAsynchronousIO::StreamAsynchronousIO(Debugger &debugger, bool for_stdout,
-                                           bool colors)
-    : Stream(0, 4, eByteOrderBig, colors), m_debugger(debugger), m_data(),
-      m_for_stdout(for_stdout) {}
+StreamAsynchronousIO::StreamAsynchronousIO(
+    Debugger &debugger, StreamAsynchronousIO::ForSTDOUT for_stdout)
+    : Stream(0, 4, eByteOrderBig, debugger.GetUseColor()), m_debugger(debugger),
+      m_data(), m_for_stdout(for_stdout) {}

 StreamAsynchronousIO::~StreamAsynchronousIO() {
-  // Flush when we destroy to make sure we display the data
+  // Flush when we destroy to make sure we display the data.
Flush(); } void StreamAsynchronousIO::Flush() { if (!m_data.empty()) { m_debugger.PrintAsync(m_data.data(), m_data.size(), m_for_stdout); - m_data = std::string(); + m_data.clear(); } } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index acdec84a1689b..5346d5a2d162a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2837,8 +2837,8 @@ void CommandInterpreter::HandleCommandsFromFile( } if (flags & eHandleCommandFlagPrintResult) { - debugger.GetOutputFile().Printf("Executing commands in '%s'.\n", - cmd_file_path.c_str()); + debugger.GetOutputFileSP()->Printf("Executing commands in '%s'.\n", + cmd_file_path.c_str()); } // Used for inheriting the right settings when "command source" might diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 8d10e5de01225..a392d5777a021 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -245,8 +245,8 @@ ScriptInterpreterIORedirect::ScriptInterpreterIORedirect( if (outfile_handle) ::setbuf(outfile_handle, nullptr); - result->SetImmediateOutputFile(debugger.GetOutputStreamSP()->GetFileSP()); - result->SetImmediateErrorFile(debugger.GetErrorStreamSP()->GetFileSP()); + result->SetImmediateOutputFile(debugger.GetOutputFileSP()); + result->SetImmediateErrorFile(debugger.GetErrorFileSP()); } } diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index cff44b588e26e..1d4cda6c046b7 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -1193,7 +1193,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { m_kext_summary_header.version = data.GetU32(&offset); if (m_kext_summary_header.version > 128) { lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable version number %u\n", m_kext_summary_header.version); @@ -1208,7 +1208,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { // If we get an improbably large entry_size, we're probably // getting bad memory. lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable entry_size %u\n", m_kext_summary_header.entry_size); @@ -1226,7 +1226,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { // If we get an improbably large number of kexts, we're probably // getting bad memory. 
lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable number of kexts %u\n", m_kext_summary_header.entry_count); @@ -1330,7 +1330,8 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( number_of_old_kexts_being_removed == 0) return true; - lldb::StreamSP s = m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + lldb::StreamSP s = + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); if (load_kexts) { if (number_of_new_kexts_being_added > 0 && number_of_old_kexts_being_removed > 0) { diff --git a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp index 8c2700cf21de9..c2db3540a797b 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp @@ -116,8 +116,6 @@ StructuredData::ObjectSP InstrumentationRuntimeUBSan::RetrieveReportData( if (!frame_sp) return StructuredData::ObjectSP(); - StreamFileSP Stream = target.GetDebugger().GetOutputStreamSP(); - EvaluateExpressionOptions options; options.SetUnwindOnError(true); options.SetTryAllThreads(true); diff --git a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp index 74e0fa7d49f82..d61c59776eee6 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp @@ -210,8 +210,8 @@ bool ReportRetriever::NotifyBreakpointHit(ProcessSP process_sp, InstrumentationRuntimeStopInfo::CreateStopReasonWithInstrumentationData( *thread_sp, description, report)); - if (StreamFileSP stream_sp = StreamFileSP( - process_sp->GetTarget().GetDebugger().GetOutputStreamSP())) + if (StreamSP stream_sp = + process_sp->GetTarget().GetDebugger().GetAsyncOutputStream()) stream_sp->Printf("AddressSanitizer report breakpoint hit. 
Use 'thread " "info -s' to get extended information about the " "report.\n"); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index 7e8eee9f5aa4f..6d028e324ee4e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -45,8 +45,8 @@ class IOHandlerLuaInterpreter : public IOHandlerDelegate, m_script_interpreter(script_interpreter), m_active_io_handler(active_io_handler) { llvm::cantFail(m_script_interpreter.GetLua().ChangeIO( - debugger.GetOutputFile().GetStream(), - debugger.GetErrorFile().GetStream())); + debugger.GetOutputFileSP()->GetStream(), + debugger.GetErrorFileSP()->GetStream())); llvm::cantFail(m_script_interpreter.EnterSession(debugger.GetID())); } diff --git a/lldb/source/Symbol/LineTable.cpp b/lldb/source/Symbol/LineTable.cpp index 3d2afcdd11997..aae4ab59ff156 100644 --- a/lldb/source/Symbol/LineTable.cpp +++ b/lldb/source/Symbol/LineTable.cpp @@ -123,7 +123,7 @@ void LineTable::InsertSequence(LineSequence *sequence) { entry_collection::iterator end_pos = m_entries.end(); LineTable::Entry::LessThanBinaryPredicate less_than_bp(this); entry_collection::iterator pos = - upper_bound(begin_pos, end_pos, entry, less_than_bp); + std::upper_bound(begin_pos, end_pos, entry, less_than_bp); // We should never insert a sequence in the middle of another sequence if (pos != begin_pos) { @@ -185,6 +185,48 @@ bool LineTable::GetLineEntryAtIndex(uint32_t idx, LineEntry &line_entry) { return false; } +uint32_t LineTable::lower_bound(const Address &so_addr) const { + if (so_addr.GetModule() != m_comp_unit->GetModule()) + return GetSize(); + + Entry search_entry; + search_entry.file_addr = so_addr.GetFileAddress(); + if (search_entry.file_addr == LLDB_INVALID_ADDRESS) + return GetSize(); + + // This is not a typo. upper_bound returns the first entry which definitely + // does not contain this address, which means the entry before it *might* + // contain it -- if it is not a termination entry. + auto pos = + llvm::upper_bound(m_entries, search_entry, Entry::EntryAddressLessThan); + + if (pos != m_entries.begin() && !std::prev(pos)->is_terminal_entry) + --pos; + + return std::distance(m_entries.begin(), pos); +} + +uint32_t LineTable::upper_bound(const Address &so_addr) const { + if (so_addr.GetModule() != m_comp_unit->GetModule()) + return GetSize(); + + Entry search_entry; + search_entry.file_addr = so_addr.GetFileAddress(); + if (search_entry.file_addr == LLDB_INVALID_ADDRESS) + return GetSize(); + + // This is not a typo. lower_bound returns the first entry which starts on or + // after the given address, which is exactly what we want -- *except* if the + // entry is a termination entry (in that case, we want the one after it). 
+ auto pos = + llvm::lower_bound(m_entries, search_entry, Entry::EntryAddressLessThan); + if (pos != m_entries.end() && pos->file_addr == search_entry.file_addr && + pos->is_terminal_entry) + ++pos; + + return std::distance(m_entries.begin(), pos); +} + bool LineTable::FindLineEntryByAddress(const Address &so_addr, LineEntry &line_entry, uint32_t *index_ptr) { @@ -199,7 +241,7 @@ bool LineTable::FindLineEntryByAddress(const Address &so_addr, if (search_entry.file_addr != LLDB_INVALID_ADDRESS) { entry_collection::const_iterator begin_pos = m_entries.begin(); entry_collection::const_iterator end_pos = m_entries.end(); - entry_collection::const_iterator pos = lower_bound( + entry_collection::const_iterator pos = std::lower_bound( begin_pos, end_pos, search_entry, Entry::EntryAddressLessThan); if (pos != end_pos) { if (pos != begin_pos) { diff --git a/lldb/source/Target/ThreadPlanTracer.cpp b/lldb/source/Target/ThreadPlanTracer.cpp index a119bf8589279..ab63cc7f6c223 100644 --- a/lldb/source/Target/ThreadPlanTracer.cpp +++ b/lldb/source/Target/ThreadPlanTracer.cpp @@ -27,6 +27,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/State.h" +#include "lldb/lldb-forward.h" using namespace lldb; using namespace lldb_private; @@ -41,13 +42,13 @@ ThreadPlanTracer::ThreadPlanTracer(Thread &thread) : m_process(*thread.GetProcess().get()), m_tid(thread.GetID()), m_enabled(false), m_stream_sp(), m_thread(nullptr) {} -Stream *ThreadPlanTracer::GetLogStream() { +StreamSP ThreadPlanTracer::GetLogStreamSP() { if (m_stream_sp) - return m_stream_sp.get(); + return m_stream_sp; else { TargetSP target_sp(GetThread().CalculateTarget()); if (target_sp) - return target_sp->GetDebugger().GetOutputStreamSP().get(); + return target_sp->GetDebugger().GetAsyncOutputStream(); } return nullptr; } @@ -65,12 +66,11 @@ void ThreadPlanTracer::Log() { bool show_frame_index = false; bool show_fullpaths = false; - Stream *stream = GetLogStream(); - if (stream) { - GetThread().GetStackFrameAtIndex(0)->Dump(stream, show_frame_index, + if (StreamSP stream_sp = GetLogStreamSP()) { + GetThread().GetStackFrameAtIndex(0)->Dump(stream_sp.get(), show_frame_index, show_fullpaths); - stream->Printf("\n"); - stream->Flush(); + stream_sp->Printf("\n"); + stream_sp->Flush(); } } @@ -129,9 +129,9 @@ void ThreadPlanAssemblyTracer::TracingStarted() { void ThreadPlanAssemblyTracer::TracingEnded() { m_register_values.clear(); } void ThreadPlanAssemblyTracer::Log() { - Stream *stream = GetLogStream(); + StreamSP stream_sp = GetLogStreamSP(); - if (!stream) + if (!stream_sp) return; RegisterContext *reg_ctx = GetThread().GetRegisterContext().get(); @@ -142,9 +142,10 @@ void ThreadPlanAssemblyTracer::Log() { uint8_t buffer[16] = {0}; // Must be big enough for any single instruction addr_valid = m_process.GetTarget().ResolveLoadAddress(pc, pc_addr); - pc_addr.Dump(stream, &GetThread(), Address::DumpStyleResolvedDescription, + pc_addr.Dump(stream_sp.get(), &GetThread(), + Address::DumpStyleResolvedDescription, Address::DumpStyleModuleWithFileAddress); - stream->PutCString(" "); + stream_sp->PutCString(" "); Disassembler *disassembler = GetDisassembler(); if (disassembler) { @@ -175,7 +176,7 @@ void ThreadPlanAssemblyTracer::Log() { instruction_list.GetInstructionAtIndex(0).get(); const FormatEntity::Entry *disassemble_format = m_process.GetTarget().GetDebugger().GetDisassemblyFormat(); - instruction->Dump(stream, max_opcode_byte_size, show_address, + instruction->Dump(stream_sp.get(), max_opcode_byte_size, 
show_address,
                       show_bytes, show_control_flow_kind, nullptr, nullptr,
                       nullptr, disassemble_format, 0);
   }

@@ -198,12 +199,12 @@ void ThreadPlanAssemblyTracer::Log() {

     if (abi->GetArgumentValues(GetThread(), value_list)) {
       for (int arg_index = 0; arg_index < num_args; ++arg_index) {
-        stream->Printf(
+        stream_sp->Printf(
             "\n\targ[%d]=%llx", arg_index,
             value_list.GetValueAtIndex(arg_index)->GetScalar().ULongLong());

         if (arg_index + 1 < num_args)
-          stream->PutCString(", ");
+          stream_sp->PutCString(", ");
       }
     }
   }
@@ -222,14 +223,14 @@ void ThreadPlanAssemblyTracer::Log() {
       if (m_register_values[reg_num].GetType() == RegisterValue::eTypeInvalid ||
          reg_value != m_register_values[reg_num]) {
        if (reg_value.GetType() != RegisterValue::eTypeInvalid) {
-          stream->PutCString("\n\t");
-          DumpRegisterValue(reg_value, *stream, *reg_info, true, false,
+          stream_sp->PutCString("\n\t");
+          DumpRegisterValue(reg_value, *stream_sp, *reg_info, true, false,
                             eFormatDefault);
         }
       }
       m_register_values[reg_num] = reg_value;
     }
   }
-  stream->EOL();
-  stream->Flush();
+  stream_sp->EOL();
+  stream_sp->Flush();
 }
diff --git a/lldb/unittests/Symbol/CMakeLists.txt b/lldb/unittests/Symbol/CMakeLists.txt
index e1d24357e33db..ab5cecd101833 100644
--- a/lldb/unittests/Symbol/CMakeLists.txt
+++ b/lldb/unittests/Symbol/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_lldb_unittest(SymbolTests
   JSONSymbolTest.cpp
+  LineTableTest.cpp
   LocateSymbolFileTest.cpp
   MangledTest.cpp
   PostfixExpressionTest.cpp
diff --git a/lldb/unittests/Symbol/LineTableTest.cpp b/lldb/unittests/Symbol/LineTableTest.cpp
new file mode 100644
index 0000000000000..2fa2913f67f9e
--- /dev/null
+++ b/lldb/unittests/Symbol/LineTableTest.cpp
@@ -0,0 +1,285 @@
+//===-- LineTableTest.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Plugins/ObjectFile/ELF/ObjectFileELF.h"
+#include "TestingSupport/SubsystemRAII.h"
+#include "TestingSupport/TestUtilities.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Symbol/CompileUnit.h"
+#include "lldb/Symbol/SymbolFile.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+using namespace lldb;
+using namespace llvm;
+using namespace lldb_private;
+
+namespace {
+
+// A fake symbol file class to allow us to create the line table "the right
+// way". Pretty much all methods except for GetCompileUnitAtIndex and
+// GetNumCompileUnits are stubbed out.
+class FakeSymbolFile : public SymbolFile {
+public:
+  /// LLVM RTTI support.
+  /// \{
+  bool isA(const void *ClassID) const override {
+    return ClassID == &ID || SymbolFile::isA(ClassID);
+  }
+  static bool classof(const SymbolFile *obj) { return obj->isA(&ID); }
+  /// \}
+
+  static void Initialize() {
+    PluginManager::RegisterPlugin("FakeSymbolFile", "", CreateInstance,
+                                  DebuggerInitialize);
+  }
+  static void Terminate() { PluginManager::UnregisterPlugin(CreateInstance); }
+
+  void InjectCompileUnit(std::unique_ptr<CompileUnit> cu_up) {
+    m_cu_sp = std::move(cu_up);
+  }
+
+private:
+  /// LLVM RTTI support.
+  static char ID;
+
+  static SymbolFile *CreateInstance(ObjectFileSP objfile_sp) {
+    return new FakeSymbolFile(std::move(objfile_sp));
+  }
+  static void DebuggerInitialize(Debugger &) {}
+
+  StringRef GetPluginName() override { return "FakeSymbolFile"; }
+  uint32_t GetAbilities() override { return UINT32_MAX; }
+  uint32_t CalculateAbilities() override { return UINT32_MAX; }
+  uint32_t GetNumCompileUnits() override { return 1; }
+  CompUnitSP GetCompileUnitAtIndex(uint32_t) override { return m_cu_sp; }
+  Symtab *GetSymtab() override { return nullptr; }
+  LanguageType ParseLanguage(CompileUnit &) override { return eLanguageTypeC; }
+  size_t ParseFunctions(CompileUnit &) override { return 0; }
+  bool ParseLineTable(CompileUnit &) override { return true; }
+  bool ParseDebugMacros(CompileUnit &) override { return true; }
+  bool ParseSupportFiles(CompileUnit &, SupportFileList &) override {
+    return true;
+  }
+  size_t ParseTypes(CompileUnit &) override { return 0; }
+  bool ParseImportedModules(const SymbolContext &,
+                            std::vector<SourceModule> &) override {
+    return false;
+  }
+  size_t ParseBlocksRecursive(Function &) override { return 0; }
+  size_t ParseVariablesForContext(const SymbolContext &) override { return 0; }
+  Type *ResolveTypeUID(user_id_t) override { return nullptr; }
+  std::optional<ArrayInfo>
+  GetDynamicArrayInfoForUID(user_id_t, const ExecutionContext *) override {
+    return std::nullopt;
+  }
+  bool CompleteType(CompilerType &) override { return true; }
+  uint32_t ResolveSymbolContext(const Address &, SymbolContextItem,
+                                SymbolContext &) override {
+    return 0;
+  }
+  void GetTypes(SymbolContextScope *, TypeClass, TypeList &) override {}
+  Expected<TypeSystemSP> GetTypeSystemForLanguage(LanguageType) override {
+    return createStringError(std::errc::not_supported, "");
+  }
+  const ObjectFile *GetObjectFile() const override {
+    return m_objfile_sp.get();
+  }
+  ObjectFile *GetObjectFile() override { return m_objfile_sp.get(); }
+  ObjectFile *GetMainObjectFile() override { return m_objfile_sp.get(); }
+  void SectionFileAddressesChanged() override {}
+  void Dump(Stream &) override {}
+  uint64_t GetDebugInfoSize(bool) override { return 0; }
+  bool GetDebugInfoIndexWasLoadedFromCache() const override { return false; }
+  void SetDebugInfoIndexWasLoadedFromCache() override {}
+  bool GetDebugInfoIndexWasSavedToCache() const override { return false; }
+  void SetDebugInfoIndexWasSavedToCache() override {}
+  bool GetDebugInfoHadFrameVariableErrors() const override { return false; }
+  void SetDebugInfoHadFrameVariableErrors() override {}
+  TypeSP MakeType(user_id_t, ConstString, std::optional<uint64_t>,
+                  SymbolContextScope *, user_id_t, Type::EncodingDataType,
+                  const Declaration &, const CompilerType &, Type::ResolveState,
+                  uint32_t) override {
+    return nullptr;
+  }
+  TypeSP CopyType(const TypeSP &) override { return nullptr; }
+
+  FakeSymbolFile(ObjectFileSP objfile_sp)
+      : m_objfile_sp(std::move(objfile_sp)) {}
+
+  ObjectFileSP m_objfile_sp;
+  CompUnitSP m_cu_sp;
+};
+
+struct FakeModuleFixture {
+  TestFile file;
+  ModuleSP module_sp;
+  SectionSP text_sp;
+  LineTable *line_table;
+};
+
+class LineTableTest : public testing::Test {
+  SubsystemRAII<FileSystem, HostInfo, ObjectFileELF, FakeSymbolFile>
+      subsystems;
+};
+
+class LineSequenceBuilder {
+public:
+  std::vector<std::unique_ptr<LineSequence>> Build() {
+    return std::move(m_sequences);
+  }
+  enum Terminal : bool { Terminal = true };
+  void Entry(addr_t addr, bool terminal = false) {
+    LineTable::AppendLineEntryToSequence(
+        m_seq_up.get(), addr, /*line=*/1, /*column=*/0,
+        /*file_idx=*/0,
+        /*is_start_of_statement=*/false, /*is_start_of_basic_block=*/false,
/*is_prologue_end=*/false, /*is_epilogue_begin=*/false, terminal);
+    if (terminal) {
+      m_sequences.push_back(std::move(m_seq_up));
+      m_seq_up = LineTable::CreateLineSequenceContainer();
+    }
+  }
+
+private:
+  std::vector<std::unique_ptr<LineSequence>> m_sequences;
+  std::unique_ptr<LineSequence> m_seq_up =
+      LineTable::CreateLineSequenceContainer();
+};
+
+} // namespace
+
+char FakeSymbolFile::ID;
+
+static llvm::Expected<FakeModuleFixture>
+CreateFakeModule(std::vector<std::unique_ptr<LineSequence>> line_sequences) {
+  Expected<TestFile> file = TestFile::fromYaml(R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_386
+Sections:
+  - Name:         .text
+    Type:         SHT_PROGBITS
+    Flags:        [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign: 0x0010
+    Address:      0x0000
+    Size:         0x1000
+)");
+  if (!file)
+    return file.takeError();
+
+  auto module_sp = std::make_shared<Module>(file->moduleSpec());
+  SectionSP text_sp =
+      module_sp->GetSectionList()->FindSectionByName(ConstString(".text"));
+  if (!text_sp)
+    return createStringError("No .text");
+
+  auto cu_up = std::make_unique<CompileUnit>(module_sp, /*user_data=*/nullptr,
+                                             /*support_file_sp=*/nullptr,
+                                             /*uid=*/0, eLanguageTypeC,
+                                             /*is_optimized=*/eLazyBoolNo);
+  LineTable *line_table = new LineTable(cu_up.get(), std::move(line_sequences));
+  cu_up->SetLineTable(line_table);
+  cast<FakeSymbolFile>(module_sp->GetSymbolFile())
+      ->InjectCompileUnit(std::move(cu_up));
+
+  return FakeModuleFixture{std::move(*file), std::move(module_sp),
+                           std::move(text_sp), line_table};
+}
+
+TEST_F(LineTableTest, LowerAndUpperBound) {
+  LineSequenceBuilder builder;
+  builder.Entry(0);
+  builder.Entry(10);
+  builder.Entry(20, LineSequenceBuilder::Terminal);
+  builder.Entry(20); // Starts right after the previous sequence.
+  builder.Entry(30, LineSequenceBuilder::Terminal);
+  builder.Entry(40); // Gap after the previous sequence.
+  builder.Entry(50, LineSequenceBuilder::Terminal);
+
+  llvm::Expected<FakeModuleFixture> fixture = CreateFakeModule(builder.Build());
+  ASSERT_THAT_EXPECTED(fixture, llvm::Succeeded());
+
+  LineTable *table = fixture->line_table;
+
+  auto make_addr = [&](addr_t addr) { return Address(fixture->text_sp, addr); };
+
+  // Both functions return the same value for boundary values. This way the
+  // index range for e.g. [0,10) is [0,1).
+  EXPECT_EQ(table->lower_bound(make_addr(0)), 0u);
+  EXPECT_EQ(table->upper_bound(make_addr(0)), 0u);
+  EXPECT_EQ(table->lower_bound(make_addr(10)), 1u);
+  EXPECT_EQ(table->upper_bound(make_addr(10)), 1u);
+  EXPECT_EQ(table->lower_bound(make_addr(20)), 3u);
+  EXPECT_EQ(table->upper_bound(make_addr(20)), 3u);
+
+  // In case there's no "real" entry at this address, they return the first real
+  // entry.
+  EXPECT_EQ(table->lower_bound(make_addr(30)), 5u);
+  EXPECT_EQ(table->upper_bound(make_addr(30)), 5u);
+
+  EXPECT_EQ(table->lower_bound(make_addr(40)), 5u);
+  EXPECT_EQ(table->upper_bound(make_addr(40)), 5u);
+
+  // For in-between values, their result differs by one. [9,19) maps to [0,2)
+  // because the first two entries contain a part of that range.
+  EXPECT_EQ(table->lower_bound(make_addr(9)), 0u);
+  EXPECT_EQ(table->upper_bound(make_addr(9)), 1u);
+  EXPECT_EQ(table->lower_bound(make_addr(19)), 1u);
+  EXPECT_EQ(table->upper_bound(make_addr(19)), 2u);
+  EXPECT_EQ(table->lower_bound(make_addr(29)), 3u);
+  EXPECT_EQ(table->upper_bound(make_addr(29)), 4u);
+
+  // In a gap, they both return the first entry after the gap.
+  EXPECT_EQ(table->lower_bound(make_addr(39)), 5u);
+  EXPECT_EQ(table->upper_bound(make_addr(39)), 5u);
+
+  // And if there's no such entry, they return the size of the list.
+  EXPECT_EQ(table->lower_bound(make_addr(50)), table->GetSize());
+  EXPECT_EQ(table->upper_bound(make_addr(50)), table->GetSize());
+  EXPECT_EQ(table->lower_bound(make_addr(59)), table->GetSize());
+  EXPECT_EQ(table->upper_bound(make_addr(59)), table->GetSize());
+}
+
+TEST_F(LineTableTest, FindLineEntryByAddress) {
+  LineSequenceBuilder builder;
+  builder.Entry(0);
+  builder.Entry(10);
+  builder.Entry(20, LineSequenceBuilder::Terminal);
+  builder.Entry(20); // Starts right after the previous sequence.
+  builder.Entry(30, LineSequenceBuilder::Terminal);
+  builder.Entry(40); // Gap after the previous sequence.
+  builder.Entry(50, LineSequenceBuilder::Terminal);
+
+  llvm::Expected<FakeModuleFixture> fixture = CreateFakeModule(builder.Build());
+  ASSERT_THAT_EXPECTED(fixture, llvm::Succeeded());
+
+  LineTable *table = fixture->line_table;
+
+  auto find = [&](addr_t addr) -> std::tuple<addr_t, addr_t, bool> {
+    LineEntry entry;
+    if (!table->FindLineEntryByAddress(Address(fixture->text_sp, addr), entry))
+      return {LLDB_INVALID_ADDRESS, LLDB_INVALID_ADDRESS, false};
+    return {entry.range.GetBaseAddress().GetFileAddress(),
+            entry.range.GetByteSize(),
+            static_cast<bool>(entry.is_terminal_entry)};
+  };
+
+  EXPECT_THAT(find(0), testing::FieldsAre(0, 10, false));
+  EXPECT_THAT(find(9), testing::FieldsAre(0, 10, false));
+  EXPECT_THAT(find(10), testing::FieldsAre(10, 10, false));
+  EXPECT_THAT(find(19), testing::FieldsAre(10, 10, false));
+  EXPECT_THAT(find(20), testing::FieldsAre(20, 10, false));
+  EXPECT_THAT(find(30), testing::FieldsAre(LLDB_INVALID_ADDRESS,
+                                           LLDB_INVALID_ADDRESS, false));
+  EXPECT_THAT(find(40), testing::FieldsAre(40, 10, false));
+  EXPECT_THAT(find(50), testing::FieldsAre(LLDB_INVALID_ADDRESS,
+                                           LLDB_INVALID_ADDRESS, false));
+}
diff --git a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst
index 90797499dec22..7603bcc95383b 100644
--- a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst
+++ b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst
@@ -6,7 +6,7 @@
 **************************************************

 ====================================================================================
-Syntax of gfx940 Instructions
+Syntax of gfx942 Instructions
 ====================================================================================

 .. contents::
    :local:
@@ -15,7 +15,7 @@ Syntax of gfx940 Instructions
 Introduction
 ============

-This document describes the syntax of gfx940 instructions.
+This document describes the syntax of gfx942 instructions.

 Notation
 ========
diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst
index ff6ec6cf71ff2..e8a76322fe76a 100644
--- a/llvm/docs/AMDGPUOperandSyntax.rst
+++ b/llvm/docs/AMDGPUOperandSyntax.rst
@@ -63,7 +63,7 @@ Note: *N* and *K* must satisfy the following conditions:
 * 0 <= *K* <= 255.
 * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32.

-GFX90A and GFX940 have an additional alignment requirement:
+GFX90A and GFX942 have an additional alignment requirement:
 pairs of *vector* registers must be even-aligned
 (first register must be even).
@@ -183,7 +183,7 @@ Note: *N* and *K* must satisfy the following conditions:
 * 0 <= *K* <= 255.
 * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32.

-GFX90A and GFX940 have an additional alignment requirement:
+GFX90A and GFX942 have an additional alignment requirement:
 pairs of *accumulator* registers must be even-aligned
 (first register must be even).
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 5966d1617feee..d580be1eb8cfc 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. - **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX942-CDNA3]_ ----------------------------------------------------------------------------------------------------------------------- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa* Frontier Edition @@ -378,20 +378,6 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Ryzen 3 Pro 4350G - Ryzen 3 Pro 4350GE - ``gfx940`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: - - kernarg preload - Packed - work-item Add product - IDs names. - - ``gfx941`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: - - kernarg preload - Packed - work-item Add product - IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X - tgsplit flat - AMD Instinct MI300A - xnack scratch @@ -583,10 +569,10 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor - ``v_dot2_f32_f16`` - ``gfx9-4-generic`` ``amdgcn`` - ``gfx940`` - sramecc - Architected FP8 and BF8 instructions, - - ``gfx941`` - tgsplit flat scratch FP8 and BF8 conversion - - ``gfx942`` - xnack - Packed instructions, as well as - - ``gfx950`` - kernarg preload work-item instructions with XF32 format + ``gfx9-4-generic`` ``amdgcn`` - ``gfx942`` - sramecc - Architected FP8 and BF8 instructions, + - ``gfx950`` - tgsplit flat scratch FP8 and BF8 conversion + - xnack - Packed instructions, as well as + - kernarg preload work-item instructions with XF32 format IDs support are not available. ``gfx10-1-generic`` ``amdgcn`` - ``gfx1010`` - xnack - Absolute flat - The following instructions are @@ -2232,7 +2218,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX1035`` 0x03d ``gfx1035`` ``EF_AMDGPU_MACH_AMDGCN_GFX1034`` 0x03e ``gfx1034`` ``EF_AMDGPU_MACH_AMDGCN_GFX90A`` 0x03f ``gfx90a`` - ``EF_AMDGPU_MACH_AMDGCN_GFX940`` 0x040 ``gfx940`` + *reserved* 0x040 Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1100`` 0x041 ``gfx1100`` ``EF_AMDGPU_MACH_AMDGCN_GFX1013`` 0x042 ``gfx1013`` ``EF_AMDGPU_MACH_AMDGCN_GFX1150`` 0x043 ``gfx1150`` @@ -2243,7 +2229,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX1200`` 0x048 ``gfx1200`` *reserved* 0x049 Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1151`` 0x04a ``gfx1151`` - ``EF_AMDGPU_MACH_AMDGCN_GFX941`` 0x04b ``gfx941`` + *reserved* 0x04b Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942`` *reserved* 0x04d Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201`` @@ -4985,7 +4971,7 @@ The fields used by CP for code objects before V3 also match those specified in bytes 383:352 4 bytes COMPUTE_PGM_RSRC3 GFX6-GFX9 Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 Compute Shader (CS) program settings used by CP to set up @@ -5070,7 +5056,7 @@ The fields used by CP for code objects before V3 also match those specified in 463:460 4 bits Reserved, must be 0. 
470:464 7 bits KERNARG_PRELOAD_SPEC_LENGTH GFX6-GFX9 - Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 - The number of dwords from the kernarg segment to preload into User SGPRs before kernel @@ -5078,7 +5064,7 @@ The fields used by CP for code objects before V3 also match those specified in :ref:`amdgpu-amdhsa-kernarg-preload`). 479:471 9 bits KERNARG_PRELOAD_SPEC_OFFSET GFX6-GFX9 - Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 - An offset in dwords into the kernarg segment to begin preloading data into User @@ -5104,7 +5090,7 @@ The fields used by CP for code objects before V3 also match those specified in GFX6-GFX9 - vgprs_used 0..256 - max(0, ceil(vgprs_used / 4) - 1) - GFX90A, GFX940 + GFX90A, GFX942 - vgprs_used 0..512 - vgprs_used = align(arch_vgprs, 4) + acc_vgprs @@ -5570,7 +5556,7 @@ The fields used by CP for code objects before V3 also match those specified in .. - .. table:: compute_pgm_rsrc3 for GFX90A, GFX940 + .. table:: compute_pgm_rsrc3 for GFX90A, GFX942 :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table ======= ======= =============================== =========================================================================== @@ -9981,15 +9967,15 @@ only accessed by a single thread, and is always write-before-read, there is never a need to invalidate these entries from the L1 cache. Hence all cache invalidates are done as ``*_vol`` to only invalidate the volatile cache lines. -The code sequences used to implement the memory model for GFX940, GFX941, GFX942 -are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx941-gfx942-table`. +The code sequences used to implement the memory model for GFX942 are defined in +table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx942-table`. - .. table:: AMDHSA Memory Model Code Sequences GFX940, GFX941, GFX942 - :name: amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx941-gfx942-table + .. table:: AMDHSA Memory Model Code Sequences GFX942 + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx942-table ============ ============ ============== ========== ================================ LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code - Ordering Sync Scope Address GFX940, GFX941, GFX942 + Ordering Sync Scope Address GFX942 Space ============ ============ ============== ========== ================================ **Non-Atomic** @@ -10024,18 +10010,12 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 load *none* *none* - local 1. ds_load store *none* *none* - global - !volatile & !nontemporal - generic - - private 1. GFX940, GFX941 + - private 1. GFX942 - constant buffer/global/flat_store - sc0=1 sc1=1 - GFX942 - buffer/global/flat_store - !volatile & nontemporal - 1. GFX940, GFX941 - buffer/global/flat_store - nt=1 sc0=1 sc1=1 - GFX942 + 1. GFX942 buffer/global/flat_store nt=1 @@ -10707,11 +10687,8 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 **Release Atomic** ------------------------------------------------------------------------------------ - store atomic release - singlethread - global 1. GFX940, GFX941 + store atomic release - singlethread - global 1. GFX942 - wavefront - generic buffer/global/flat_store - sc0=1 sc1=1 - GFX942 - buffer/global/flat_store store atomic release - singlethread - local *If TgSplit execution mode, - wavefront local address space cannot @@ -10749,10 +10726,7 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 store that is being released. - 2. 
GFX940, GFX941 - buffer/global/flat_store - sc0=1 sc1=1 - GFX942 + 2. GFX942 buffer/global/flat_store sc0=1 store atomic release - workgroup - local *If TgSplit execution mode, @@ -10813,10 +10787,7 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 store that is being released. - 3. GFX940, GFX941 - buffer/global/flat_store - sc0=1 sc1=1 - GFX942 + 3. GFX942 buffer/global/flat_store sc1=1 store atomic release - system - global 1. buffer_wbl2 sc0=1 sc1=1 @@ -17574,11 +17545,7 @@ in this description. CDNA 2 :doc:`GFX9` :doc:`gfx90a` - CDNA 3 :doc:`GFX9` :doc:`gfx940` - - :doc:`gfx941` - - :doc:`gfx942` + CDNA 3 :doc:`GFX9` :doc:`gfx942` RDNA 1 :doc:`GFX10 RDNA1` :doc:`gfx1010` @@ -17616,7 +17583,7 @@ combinations of operands, refer to one of instruction set architecture manuals [AMD-GCN-GFX6]_, [AMD-GCN-GFX7]_, [AMD-GCN-GFX8]_, [AMD-GCN-GFX900-GFX904-VEGA]_, [AMD-GCN-GFX906-VEGA7NM]_, [AMD-GCN-GFX908-CDNA1]_, [AMD-GCN-GFX90A-CDNA2]_, -[AMD-GCN-GFX940-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_, +[AMD-GCN-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_, [AMD-GCN-GFX11-RDNA3]_ and [AMD-GCN-GFX11-RDNA3.5]_. Operands @@ -18129,7 +18096,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table` ``.amdhsa_user_sgpr_private_segment_buffer`` 0 GFX6-GFX10 Controls ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER in (except :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - GFX940) + GFX942) ``.amdhsa_user_sgpr_dispatch_ptr`` 0 GFX6-GFX12 Controls ENABLE_SGPR_DISPATCH_PTR in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_queue_ptr`` 0 GFX6-GFX12 Controls ENABLE_SGPR_QUEUE_PTR in @@ -18140,7 +18107,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_flat_scratch_init`` 0 GFX6-GFX10 Controls ENABLE_SGPR_FLAT_SCRATCH_INIT in (except :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - GFX940) + GFX942) ``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in @@ -18151,8 +18118,8 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_system_sgpr_private_segment_wavefront_offset`` 0 GFX6-GFX10 Controls ENABLE_PRIVATE_SEGMENT in (except :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. - GFX940) - ``.amdhsa_enable_private_segment`` 0 GFX940, Controls ENABLE_PRIVATE_SEGMENT in + GFX942) + ``.amdhsa_enable_private_segment`` 0 GFX942, Controls ENABLE_PRIVATE_SEGMENT in GFX11-GFX12 :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_system_sgpr_workgroup_id_x`` 1 GFX6-GFX12 Controls ENABLE_SGPR_WORKGROUP_ID_X in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. @@ -18173,14 +18140,14 @@ terminated by an ``.end_amdhsa_kernel`` directive. Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_accum_offset`` Required GFX90A, Offset of a first AccVGPR in the unified register file. - GFX940 Used to calculate ACCUM_OFFSET in + GFX942 Used to calculate ACCUM_OFFSET in :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. ``.amdhsa_reserve_vcc`` 1 GFX6-GFX12 Whether the kernel may use the special VCC SGPR. 
Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_reserve_flat_scratch`` 1 GFX7-GFX10 Whether the kernel may use flat instructions to access (except scratch memory. Used to calculate - GFX940) GRANULATED_WAVEFRONT_SGPR_COUNT in + GFX942) GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_reserve_xnack_mask`` Target GFX8-GFX10 Whether the kernel may trigger XNACK replay. Feature Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in @@ -18211,7 +18178,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. ``.amdhsa_fp16_overflow`` 0 GFX9-GFX12 Controls FP16_OVFL in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_tg_split`` Target GFX90A, Controls TG_SPLIT in - Feature GFX940, :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. + Feature GFX942, :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. Specific GFX11-GFX12 (tgsplit) ``.amdhsa_workgroup_processor_mode`` Target GFX10-GFX12 Controls ENABLE_WGP_MODE in @@ -18242,9 +18209,9 @@ terminated by an ``.end_amdhsa_kernel`` directive. ``.amdhsa_exception_int_div_zero`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_user_sgpr_kernarg_preload_length`` 0 GFX90A, Controls KERNARG_PRELOAD_SPEC_LENGTH in - GFX940 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. + GFX942 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_kernarg_preload_offset`` 0 GFX90A, Controls KERNARG_PRELOAD_SPEC_OFFSET in - GFX940 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. + GFX942 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ======================================================== =================== ============ =================== .amdgpu_metadata @@ -18414,7 +18381,7 @@ Additional Documentation .. [AMD-GCN-GFX906-VEGA7NM] `AMD Vega 7nm Instruction Set Architecture `__ .. [AMD-GCN-GFX908-CDNA1] `AMD Instinct MI100 Instruction Set Architecture `__ .. [AMD-GCN-GFX90A-CDNA2] `AMD Instinct MI200 Instruction Set Architecture `__ -.. [AMD-GCN-GFX940-GFX942-CDNA3] `AMD Instinct MI300 Instruction Set Architecture `__ +.. [AMD-GCN-GFX942-CDNA3] `AMD Instinct MI300 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA1] `AMD RDNA 1.0 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA2] `AMD RDNA 2 Instruction Set Architecture `__ .. [AMD-GCN-GFX11-RDNA3] `AMD RDNA 3 Instruction Set Architecture `__ diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 8550af456e961..675b458c41e7b 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1183,6 +1183,93 @@ operations. For more information, refer to the PTX ISA ``_. +'``llvm.nvvm.tcgen05.shift``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.shift.{cg1/cg2}``' intrinsics correspond to +the ``tcgen05.shift.{cg1/cg2}`` PTX instructions. The ``tcgen05.shift`` +is an asynchronous instruction which initiates the shifting of 32-byte +elements downwards across all the rows, except the last, by one row. +The address operand ``%tmem_addr`` specifies the base address of the +matrix in the Tensor Memory whose rows must be down shifted. + +For more information, refer to the PTX ISA +``_. 
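To make the lowering concrete, here is a minimal sketch (not part of the patch) of how a frontend might emit this intrinsic from C++ through IRBuilder. The helper name and setup are illustrative assumptions, and the declaration helper is spelled ``Intrinsic::getOrInsertDeclaration`` in newer trees.

.. code-block:: c++

   // Hypothetical sketch: emitting tcgen05.shift.down.cg1 from C++.
   // Assumes an existing Module and an addrspace(6) Tensor Memory pointer.
   #include "llvm/IR/IRBuilder.h"
   #include "llvm/IR/IntrinsicsNVPTX.h"
   #include "llvm/IR/Module.h"

   using namespace llvm;

   static CallInst *emitTcgen05ShiftDown(IRBuilder<> &B, Module &M,
                                         Value *TMemAddr) {
     // The intrinsic takes only the base address of the matrix in Tensor
     // Memory; the shift itself completes asynchronously.
     Function *ShiftFn =
         Intrinsic::getDeclaration(&M, Intrinsic::nvvm_tcgen05_shift_down_cg1);
     return B.CreateCall(ShiftFn, {TMemAddr});
   }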
+ +'``llvm.nvvm.tcgen05.cp``' +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.cp.4x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.cp.{shape}.{src_fmt}.{cg1/cg2}``' intrinsics +correspond to the ``tcgen05.cp.*`` family of PTX instructions. +The ``tcgen05.cp`` instruction initiates an asynchronous copy operation from +shared memory to the location specified by ``%tmem_addr`` in Tensor Memory. +The 64-bit register operand ``%sdesc`` is the matrix descriptor representing +the source matrix in shared memory that needs to be copied. + +The valid shapes for the copy operation are: +{128x256b, 4x256b, 128x128b, 64x128b_warpx2_02_13, 64x128b_warpx2_01_23, 32x128b_warpx4}. + +Shapes ``64x128b`` and ``32x128b`` require dedicated multicast qualifiers, +which are appended to the corresponding intrinsic names. + +Optionally, the data can be decompressed from the source format in the shared memory +to the destination format in Tensor Memory during the copy operation. Currently, +only ``.b8x16`` is supported as destination format. The valid source formats are +``.b6x16_p32`` and ``.b4x16_p64``. + +When the source format is ``.b6x16_p32``, a contiguous set of 16 elements of 6-bits +each followed by four bytes of padding (``_p32``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. 
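As a quick sanity check of the layout arithmetic above (an illustration added here, not text from the PTX spec): 16 six-bit elements plus 32 bits of padding occupy exactly the 16 bytes of the decompressed ``.b8x16`` tile, and the ``.b4x16_p64`` format described next balances the same way.

.. code-block:: c++

   // Layout arithmetic for the decompression formats, in bits.
   static_assert(16 * 6 + 32 == 128, "b6x16_p32: data plus padding is 16 bytes");
   static_assert(16 * 4 + 64 == 128, "b4x16_p64: data plus padding is 16 bytes");
   static_assert(16 * 8 == 128, "b8x16 destination tile is also 16 bytes");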
+ +When the source format is ``.b4x16_p64``, a contiguous set of 16 elements of 4-bits +each followed by eight bytes of padding (``_p64``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. + +For more information on the decompression schemes, refer to the PTX ISA +``_. + +For more information on the tcgen05.cp instruction, refer to the PTX ISA +``_. Other Intrinsics ---------------- diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h index 573df8833bd46..06a00d9ae7899 100644 --- a/llvm/include/llvm/Analysis/CaptureTracking.h +++ b/llvm/include/llvm/Analysis/CaptureTracking.h @@ -14,13 +14,11 @@ #define LLVM_ANALYSIS_CAPTURETRACKING_H #include "llvm/ADT/DenseMap.h" -#include "llvm/Support/ModRef.h" namespace llvm { class Value; class Use; - class CaptureInfo; class DataLayout; class Instruction; class DominatorTree; @@ -79,47 +77,10 @@ namespace llvm { const DominatorTree &DT, unsigned MaxUsesToExplore = 0); - /// Capture information for a specific Use. - struct UseCaptureInfo { - /// Components captured by this use. - CaptureComponents UseCC; - /// Components captured by the return value of the user of this Use. - CaptureComponents ResultCC; - - UseCaptureInfo(CaptureComponents UseCC, - CaptureComponents ResultCC = CaptureComponents::None) - : UseCC(UseCC), ResultCC(ResultCC) {} - - static UseCaptureInfo passthrough() { - return UseCaptureInfo(CaptureComponents::None, CaptureComponents::All); - } - - bool isPassthrough() const { - return capturesNothing(UseCC) && capturesAnything(ResultCC); - } - - operator CaptureComponents() const { return UseCC | ResultCC; } - }; - /// This callback is used in conjunction with PointerMayBeCaptured. In /// addition to the interface here, you'll need to provide your own getters /// to see whether anything was captured. struct CaptureTracker { - /// Action returned from captures(). - enum Action { - /// Stop the traversal. - Stop, - /// Continue traversal, and also follow the return value of the user if - /// it has additional capture components (that is, if it has capture - /// components in Ret that are not part of Other). - Continue, - /// Continue traversal, but do not follow the return value of the user, - /// even if it has additional capture components. Should only be used if - /// captures() has already taken the potential return captures into - /// account. - ContinueIgnoringReturn, - }; - virtual ~CaptureTracker(); /// tooManyUses - The depth of traversal has breached a limit. There may be @@ -133,12 +94,10 @@ namespace llvm { /// U->getUser() is always an Instruction. virtual bool shouldExplore(const Use *U); - /// Use U directly captures CI.UseCC and additionally CI.ResultCC - /// through the return value of the user of U. - /// - /// Return one of Stop, Continue or ContinueIgnoringReturn to control - /// further traversal. - virtual Action captured(const Use *U, UseCaptureInfo CI) = 0; + /// captured - Information about the pointer was captured by the user of + /// use U. Return true to stop the traversal or false to continue looking + /// for more capturing instructions. + virtual bool captured(const Use *U) = 0; /// isDereferenceableOrNull - Overload to allow clients with additional /// knowledge about pointer dereferenceability to provide it and thereby @@ -146,18 +105,21 @@ namespace llvm { virtual bool isDereferenceableOrNull(Value *O, const DataLayout &DL); }; + /// Types of use capture kinds, see \p DetermineUseCaptureKind. 
+ enum class UseCaptureKind { + NO_CAPTURE, + MAY_CAPTURE, + PASSTHROUGH, + }; + /// Determine what kind of capture behaviour \p U may exhibit. /// - /// The returned UseCaptureInfo contains the components captured directly - /// by the use (UseCC) and the components captured through the return value - /// of the user (ResultCC). - /// - /// \p Base is the starting value of the capture analysis, which is - /// relevant for address_is_null captures. + /// A use can be no-capture, a use can potentially capture, or a use can be + /// passthrough such that the uses of the user or \p U should be inspected. /// The \p IsDereferenceableOrNull callback is used to rule out capturing for /// certain comparisons. - UseCaptureInfo - DetermineUseCaptureKind(const Use &U, const Value *Base, + UseCaptureKind + DetermineUseCaptureKind(const Use &U, llvm::function_ref<bool(Value *, const DataLayout &)> IsDereferenceableOrNull); diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index bd5a796c0b31c..2f6b248055826 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -157,6 +157,20 @@ enum class RootElementFlag : uint32_t { #include "DXContainerConstants.def" }; +#define ROOT_PARAMETER(Val, Enum) Enum = Val, +enum class RootParameterType : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef<EnumEntry<RootParameterType>> getRootParameterTypes(); + +#define SHADER_VISIBILITY(Val, Enum) Enum = Val, +enum class ShaderVisibility : uint32_t { +#include "DXContainerConstants.def" +}; + +ArrayRef<EnumEntry<ShaderVisibility>> getShaderVisibility(); + PartType parsePartType(StringRef S); struct VertexPSVInfo { @@ -546,6 +560,49 @@ struct ProgramSignatureElement { static_assert(sizeof(ProgramSignatureElement) == 32, "ProgramSignatureElement is misaligned"); +struct RootConstants { + uint32_t ShaderRegister; + uint32_t RegisterSpace; + uint32_t Num32BitValues; + + void swapBytes() { + sys::swapByteOrder(ShaderRegister); + sys::swapByteOrder(RegisterSpace); + sys::swapByteOrder(Num32BitValues); + } +}; + +struct RootParameter { + dxbc::RootParameterType ParameterType; + union { + dxbc::RootConstants Constants; + }; + dxbc::ShaderVisibility ShaderVisibility; + + void swapBytes() { + sys::swapByteOrder(ShaderVisibility); + switch (ParameterType) { + case RootParameterType::Constants32Bit: + Constants.swapBytes(); + break; + case RootParameterType::Empty: + llvm_unreachable("invalid value for ParameterType"); + break; + } + sys::swapByteOrder(ParameterType); + } +}; + +struct RootSignatureHeader { + uint32_t Version; + uint32_t Flags; + + void swapBytes() { + sys::swapByteOrder(Version); + sys::swapByteOrder(Flags); + } +}; + struct RootSignatureValidations { static bool isValidRootFlag(uint32_t Flags) { return (Flags & ~0xfff) == 0; } @@ -553,6 +610,16 @@ struct RootSignatureValidations { static bool isValidVersion(uint32_t Version) { return (Version == 1 || Version == 2); } + + static bool isValidParameterType(dxbc::RootParameterType Type) { + // RootParameterType::Empty is the highest value in the enum. + return Type < dxbc::RootParameterType::Empty; + } + + static bool isValidShaderVisibility(dxbc::ShaderVisibility Visibility) { + // ShaderVisibility::Empty is the highest value in the enum.
+ return Visibility < dxbc::ShaderVisibility::Empty; + } }; } // namespace dxbc diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 6d44ea14df444..1bf3aa6096c1c 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -72,6 +72,26 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) #undef ROOT_ELEMENT_FLAG #endif // ROOT_ELEMENT_FLAG +#ifdef ROOT_PARAMETER + +ROOT_PARAMETER(1, Constants32Bit) +ROOT_PARAMETER(5, Empty) +#undef ROOT_PARAMETER +#endif // ROOT_PARAMETER + +#ifdef SHADER_VISIBILITY + +SHADER_VISIBILITY(0, All) +SHADER_VISIBILITY(1, Vertex) +SHADER_VISIBILITY(2, Hull) +SHADER_VISIBILITY(3, Domain) +SHADER_VISIBILITY(4, Geometry) +SHADER_VISIBILITY(5, Pixel) +SHADER_VISIBILITY(6, Amplification) +SHADER_VISIBILITY(7, Mesh) +SHADER_VISIBILITY(8, Empty) +#undef SHADER_VISIBILITY +#endif // SHADER_VISIBILITY #ifdef DXIL_MODULE_FLAG diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 64f643749d6ac..37eab89e706db 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -814,7 +814,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, - EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043, @@ -825,7 +825,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, - EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4B = 0x04b, EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 4e18f5cc913a7..50eff989feda0 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -425,7 +425,8 @@ class LegalizerHelper { LegalizeResult lowerThreewayCompare(MachineInstr &MI); LegalizeResult lowerMinMax(MachineInstr &MI); LegalizeResult lowerFCopySign(MachineInstr &MI); - LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI); + LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI, + bool ShouldCanonicalize = true); LegalizeResult lowerFMad(MachineInstr &MI); LegalizeResult lowerIntrinsicRound(MachineInstr &MI); LegalizeResult lowerFFloor(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h index 8a0bf3dc71ad2..ad05368bea6a4 100644 --- a/llvm/include/llvm/CodeGen/Register.h +++ b/llvm/include/llvm/CodeGen/Register.h @@ -42,11 +42,12 @@ class Register { /// /// FIXME: remove in favor of member. static constexpr bool isStackSlot(unsigned Reg) { - return MCRegister::isStackSlot(Reg); + return MCRegister::FirstStackSlot <= Reg && + Reg < MCRegister::VirtualRegFlag; } /// Return true if this is a stack slot. - constexpr bool isStack() const { return MCRegister::isStackSlot(Reg); } + constexpr bool isStack() const { return isStackSlot(Reg); } /// Compute the frame index from a register value representing a stack slot. 
static int stackSlot2Index(Register Reg) { diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index cf8e4a3d2513b..aa0dfbe666cde 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -873,7 +873,7 @@ class SelectionDAG { /// for integers, a type wider than) VT's element type. SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op) { // VerifySDNode (via InsertNode) checks BUILD_VECTOR later. - if (Op.getOpcode() == ISD::UNDEF) { + if (Op.isUndef()) { assert((VT.getVectorElementType() == Op.getValueType() || (VT.isInteger() && VT.getVectorElementType().bitsLE(Op.getValueType()))) && @@ -889,7 +889,7 @@ class SelectionDAG { // Return a splat ISD::SPLAT_VECTOR node, consisting of Op splatted to all // elements. SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op) { - if (Op.getOpcode() == ISD::UNDEF) { + if (Op.isUndef()) { assert((VT.getVectorElementType() == Op.getValueType() || (VT.isInteger() && VT.getVectorElementType().bitsLE(Op.getValueType()))) && diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index a4c3d042fe3a4..7ec945d3a0108 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5314,7 +5314,8 @@ class TargetLowering : public TargetLoweringBase { SelectionDAG &DAG) const; /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. - SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; + SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG, + bool ShouldCanonicalize = true) const; /// Expand fminimum/fmaximum into multiple comparison with selects. SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const; diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 8e47e3c7b3a7c..90fe864d4ae71 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1692,11 +1692,6 @@ class CallBase : public Instruction { return capturesNothing(getCaptureInfo(OpNo)); } - /// Returns whether the call has an argument that has an attribute like - /// captures(ret: address, provenance), where the return capture components - /// are not a subset of the other capture components. - bool hasArgumentWithAdditionalReturnCaptureComponents() const; - /// Determine whether this argument is passed by value. bool isByValArgument(unsigned ArgNo) const { return paramHasAttr(ArgNo, Attribute::ByVal); diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 9558f2b9b74e0..1e4f25c642493 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1074,7 +1074,7 @@ class AMDGPUImageDimIntrinsic.DmaskArgIndex>>]), @@ -1321,7 +1321,7 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // Note: volatile bit is **not** permitted here. 
@@ -1351,7 +1351,7 @@ class AMDGPURawBufferLoad : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1381,7 +1381,7 @@ class AMDGPURawPtrBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1413,7 +1413,7 @@ class AMDGPUStructBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1431,7 +1431,7 @@ class AMDGPUStructAtomicBufferLoad : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1448,7 +1448,7 @@ class AMDGPUStructPtrBufferLoad : DefaultAttrsIn llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1467,7 +1467,7 @@ class AMDGPUStructPtrAtomicBufferLoad : Intrinsi llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1485,7 +1485,7 @@ class AMDGPURawBufferStore : DefaultAttrsIntrins llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1503,7 +1503,7 @@ class AMDGPURawPtrBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: 
volatile op (bit 31, stripped at lowering) @@ -1523,7 +1523,7 @@ class AMDGPUStructBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1542,7 +1542,7 @@ class AMDGPUStructPtrBufferStore : DefaultAttrsI llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1628,7 +1628,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; -// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx950, gfx12+. +// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx942, gfx950, gfx12+. def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic; class AMDGPUStructBufferAtomic : Intrinsic < @@ -1727,7 +1727,7 @@ def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz [IntrReadMem, @@ -1743,7 +1743,7 @@ def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1761,7 +1761,7 @@ def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1779,7 +1779,7 @@ def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1797,7 +1797,7 @@ def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 
4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1815,7 +1815,7 @@ def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1834,7 +1834,7 @@ def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1853,7 +1853,7 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1872,7 +1872,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1891,7 +1891,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1914,7 +1914,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1934,7 +1934,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -3007,7 +3007,7 @@ def int_amdgcn_fdot2_f32_bf16 : // f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + c // TODO: This actually is similar to llvm.amdgcn.fdot2 intrinsics which 
produces -// v_dot2c_f32_f16 on gfx940. Maybe we can consolidate these. +// v_dot2c_f32_f16 on gfx942. Maybe we can consolidate these. def int_amdgcn_fdot2c_f32_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">, @@ -3250,7 +3250,7 @@ def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic; -// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. +// Note: in gfx942 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. // Three bits corresponding to the neg modifier applied to the respective // source operand. def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic; @@ -3258,7 +3258,7 @@ def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic : diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7ef270f3256a6..c32bf0318b5d6 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -55,6 +55,14 @@ def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr // MISC // +// Helper class that concatenates list elements with +// a given separator 'sep' and returns the result. +// Handles empty strings. +class StrJoin str_list> { + string ret = !foldl("", str_list, a, b, + !if(!eq(a, ""), b, !if(!eq(b, ""), a, !strconcat(a, sep, b)))); +} + // Helper class that represents a 'fragment' of an NVPTX *MMA instruction. // Geom: mnk. E.g. m8n32k16 // Frag: [a|b|c|d] ([x1|x2|x4] for ldmatrix) @@ -5140,6 +5148,11 @@ foreach cta_group = ["cg1", "cg2"] in { [llvm_shared_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>]>; + + def int_nvvm_tcgen05_shift_down_ # cta_group : Intrinsic<[], + [llvm_tmem_ptr_ty], // tmem_addr + [IntrConvergent, IntrArgMemOnly, + NoCapture>]>; } // Tcgen05 wait_ld/st intrinsics @@ -5154,4 +5167,23 @@ def int_nvvm_tcgen05_fence_before_thread_sync : Intrinsic<[], [], def int_nvvm_tcgen05_fence_after_thread_sync : Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; +// Tcgen05 cp intrinsics +foreach cta_group = ["cg1", "cg2"] in { + foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { + foreach shape = ["128x256b", "4x256b", "128x128b", + "64x128b_warpx2_02_13", + "64x128b_warpx2_01_23", + "32x128b_warpx4"] in { + defvar intr_suffix = StrJoin<"_", [shape, src_fmt, cta_group]>.ret; + defvar name_suffix = StrJoin<".", [shape, src_fmt, cta_group]>.ret; + + def int_nvvm_tcgen05_cp_ # intr_suffix : Intrinsic<[], + [llvm_tmem_ptr_ty, // tmem_addr + llvm_i64_ty], // smem descriptor + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>], + "llvm.nvvm.tcgen05.cp." 
# name_suffix>; + } + } +} + } // let TargetPrefix = "nvvm" diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index a7963543c4350..c6ac341d71a20 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -384,8 +384,8 @@ HANDLE_LIBCALL(FPEXT_F16_F128, "__extendhftf2") HANDLE_LIBCALL(FPEXT_F16_F80, "__extendhfxf2") HANDLE_LIBCALL(FPEXT_F32_F64, "__extendsfdf2") HANDLE_LIBCALL(FPEXT_F16_F64, "__extendhfdf2") -HANDLE_LIBCALL(FPEXT_F16_F32, "__gnu_h2f_ieee") -HANDLE_LIBCALL(FPROUND_F32_F16, "__gnu_f2h_ieee") +HANDLE_LIBCALL(FPEXT_F16_F32, "__extendhfsf2") +HANDLE_LIBCALL(FPROUND_F32_F16, "__truncsfhf2") HANDLE_LIBCALL(FPROUND_F64_F16, "__truncdfhf2") HANDLE_LIBCALL(FPROUND_F80_F16, "__truncxfhf2") HANDLE_LIBCALL(FPROUND_F128_F16, "__trunctfhf2") diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index e414112498798..ffd1c034768de 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/BinaryFormat/DXContainer.h" #include #include @@ -15,12 +16,10 @@ class raw_ostream; namespace mcdxbc { struct RootSignatureDesc { - uint32_t Version = 2; - uint32_t NumParameters = 0; - uint32_t RootParametersOffset = 0; - uint32_t NumStaticSamplers = 0; - uint32_t StaticSamplersOffset = 0; - uint32_t Flags = 0; + + dxbc::RootSignatureHeader Header; + SmallVector Parameters; + RootSignatureDesc() : Header(dxbc::RootSignatureHeader{2, 0}) {} void write(raw_ostream &OS) const; }; diff --git a/llvm/include/llvm/MC/MCRegister.h b/llvm/include/llvm/MC/MCRegister.h index 53005bb03c2ee..16d0709753b35 100644 --- a/llvm/include/llvm/MC/MCRegister.h +++ b/llvm/include/llvm/MC/MCRegister.h @@ -54,14 +54,6 @@ class MCRegister { static constexpr unsigned FirstStackSlot = 1u << 30; static constexpr unsigned VirtualRegFlag = 1u << 31; - /// This is the portion of the positive number space that is not a physical - /// register. StackSlot values do not exist in the MC layer, see - /// Register::isStackSlot() for the more information on them. - /// - static constexpr bool isStackSlot(unsigned Reg) { - return FirstStackSlot <= Reg && Reg < VirtualRegFlag; - } - /// Return true if the specified register number is in /// the physical register namespace. 
static constexpr bool isPhysicalRegister(unsigned Reg) { diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index c3a2f756bd683..631744bd4fd94 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -119,12 +119,14 @@ namespace DirectX { class RootSignature { private: - uint32_t Version; - uint32_t NumParameters; - uint32_t RootParametersOffset; - uint32_t NumStaticSamplers; - uint32_t StaticSamplersOffset; - uint32_t Flags; + uint32_t Version = 2; + uint32_t NumParameters = 0; + uint32_t RootParametersOffset = 0; + uint32_t NumStaticSamplers = 0; + uint32_t StaticSamplersOffset = 0; + uint32_t Flags = 0; + + SmallVector Parameters; public: RootSignature() {} @@ -135,6 +137,9 @@ class RootSignature { uint32_t getRootParametersOffset() const { return RootParametersOffset; } uint32_t getNumStaticSamplers() const { return NumStaticSamplers; } uint32_t getStaticSamplersOffset() const { return StaticSamplersOffset; } + const SmallVector &getParameters() const { + return Parameters; + } uint32_t getFlags() const { return Flags; } }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index ecad35e82b155..f1c11379e1fb0 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -79,11 +79,11 @@ struct RootSignatureYamlDesc { RootSignatureYamlDesc(const object::DirectX::RootSignature &Data); uint32_t Version; - uint32_t NumParameters; - uint32_t RootParametersOffset; uint32_t NumStaticSamplers; uint32_t StaticSamplersOffset; + SmallVector Parameters; + uint32_t getEncodedFlags(); #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -192,6 +192,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::ResourceBindInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureElement) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::PSVInfo::MaskVector) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureParameter) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::dxbc::RootParameter) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::SemanticKind) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode) @@ -200,6 +201,8 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceKind) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::RootParameterType) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::ShaderVisibility) namespace llvm { @@ -264,6 +267,14 @@ template <> struct MappingTraits { DXContainerYAML::RootSignatureYamlDesc &RootSignature); }; +template <> struct MappingTraits { + static void mapping(IO &IO, dxbc::RootParameter &P); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, dxbc::RootConstants &C); +}; + } // namespace yaml } // namespace llvm diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index 28e33ca0f2312..2e91b96bb22e6 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -9,6 +9,7 @@ #ifndef LLVM_SANDBOXIR_VALUE_H #define LLVM_SANDBOXIR_VALUE_H +#include "llvm/IR/Metadata.h" #include "llvm/IR/Value.h" #include "llvm/SandboxIR/Use.h" @@ -282,6 +283,28 @@ class Value { #endif }; +class OpaqueValue : public Value { +protected: + OpaqueValue(llvm::Value *V, 
Context &Ctx) + : Value(ClassID::OpaqueValue, V, Ctx) {} + friend class Context; // For constructor. + +public: + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::OpaqueValue; + } +#ifndef NDEBUG + void verify() const override { + assert((isa(Val) || isa(Val)) && + "Expected Metadata or InlineAssembly!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif // NDEBUG +}; + } // namespace llvm::sandboxir #endif // LLVM_SANDBOXIR_VALUE_H diff --git a/llvm/include/llvm/SandboxIR/Values.def b/llvm/include/llvm/SandboxIR/Values.def index 3d8ad6ce197f4..f5ead54a08e10 100644 --- a/llvm/include/llvm/SandboxIR/Values.def +++ b/llvm/include/llvm/SandboxIR/Values.def @@ -21,6 +21,7 @@ DEF_CONST(Function, Function) DEF_VALUE(Argument, Argument) +DEF_VALUE(OpaqueValue, OpaqueValue) DEF_USER(User, User) DEF_VALUE(Block, BasicBlock) diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index e4345e5739e99..8e9dc29e4f48a 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -162,8 +162,9 @@ class DebugCounter { protected: unsigned addCounter(const std::string &Name, const std::string &Desc) { unsigned Result = RegisteredCounters.insert(Name); - Counters[Result] = {}; - Counters[Result].Desc = Desc; + auto &C = Counters[Result]; + C = {}; + C.Desc = Desc; return Result; } // Struct to store counter info. diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 7f58f5236aedd..eb660844b0b3a 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -326,10 +326,6 @@ inline bool capturesFullProvenance(CaptureComponents CC) { return (CC & CaptureComponents::Provenance) == CaptureComponents::Provenance; } -inline bool capturesAll(CaptureComponents CC) { - return CC == CaptureComponents::All; -} - raw_ostream &operator<<(raw_ostream &OS, CaptureComponents CC); /// Represents which components of the pointer may be captured in which @@ -354,15 +350,6 @@ class CaptureInfo { /// Create CaptureInfo that may capture all components of the pointer. static CaptureInfo all() { return CaptureInfo(CaptureComponents::All); } - /// Create CaptureInfo that may only capture via the return value. - static CaptureInfo - retOnly(CaptureComponents RetComponents = CaptureComponents::All) { - return CaptureInfo(CaptureComponents::None, RetComponents); - } - - /// Whether the pointer is only captured via the return value. - bool isRetOnly() const { return capturesNothing(OtherComponents); } - /// Get components potentially captured by the return value. 
CaptureComponents getRetComponents() const { return RetComponents; } diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index 55e7b417428c4..f776b41f3d7ca 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -83,8 +83,6 @@ enum GPUKind : uint32_t { GK_GFX909 = 65, GK_GFX90A = 66, GK_GFX90C = 67, - GK_GFX940 = 68, - GK_GFX941 = 69, GK_GFX942 = 70, GK_GFX950 = 71, diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 58297accc7f1f..1a9136e464d25 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -835,15 +835,9 @@ bool llvm::isBaseOfObject(const Value *V) { } bool llvm::isEscapeSource(const Value *V) { - if (auto *CB = dyn_cast(V)) { - if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, true)) - return false; - - // The return value of a function with a captures(ret: address, provenance) - // attribute is not necessarily an escape source. The return value may - // alias with a non-escaping object. - return !CB->hasArgumentWithAdditionalReturnCaptureComponents(); - } + if (auto *CB = dyn_cast(V)) + return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, + true); // The load case works because isNonEscapingLocalObject considers all // stores to be escapes (it passes true for the StoreCaptures argument @@ -859,6 +853,12 @@ bool llvm::isEscapeSource(const Value *V) { if (isa(V)) return true; + // Capture tracking considers insertions into aggregates and vectors as + // captures. As such, extractions from aggregates and vectors are escape + // sources. + if (isa(V)) + return true; + // Same for inttoptr constant expressions. if (auto *CE = dyn_cast(V)) if (CE->getOpcode() == Instruction::IntToPtr) diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp index 21530693c5f18..c27bfa6f3cc2c 100644 --- a/llvm/lib/Analysis/AssumeBundleQueries.cpp +++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp @@ -85,13 +85,14 @@ void llvm::fillMapFromAssume(AssumeInst &Assume, RetainedKnowledgeMap &Result) { if (!CI) continue; uint64_t Val = CI->getZExtValue(); - auto Lookup = Result.find(Key); - if (Lookup == Result.end() || !Lookup->second.count(&Assume)) { - Result[Key][&Assume] = {Val, Val}; + auto [It, Inserted] = Result[Key].try_emplace(&Assume); + if (Inserted) { + It->second = {Val, Val}; continue; } - Lookup->second[&Assume].Min = std::min(Val, Lookup->second[&Assume].Min); - Lookup->second[&Assume].Max = std::max(Val, Lookup->second[&Assume].Max); + auto &MinMax = It->second; + MinMax.Min = std::min(Val, MinMax.Min); + MinMax.Max = std::max(Val, MinMax.Max); } } diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 5120b910e7896..49baf2eb84bb3 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -81,15 +81,14 @@ struct SimpleCaptureTracker : public CaptureTracker { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. 
+ bool captured(const Use *U) override { if (isa(U->getUser()) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; LLVM_DEBUG(dbgs() << "Captured by: " << *U->getUser() << "\n"); Captured = true; - return Stop; + return true; } bool ReturnCaptures; @@ -123,21 +122,19 @@ struct CapturesBefore : public CaptureTracker { return !isPotentiallyReachable(I, BeforeHere, nullptr, DT, LI); } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; // Check isSafeToPrune() here rather than in shouldExplore() to avoid // an expensive reachability query for every instruction we look at. // Instead we only do one for actual capturing candidates. if (isSafeToPrune(I)) - // If the use is not reachable, the instruction result isn't either. - return ContinueIgnoringReturn; + return false; Captured = true; - return Stop; + return true; } const Instruction *BeforeHere; @@ -169,11 +166,10 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = &*F.getEntryBlock().begin(); } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; if (!EarliestCapture) EarliestCapture = I; @@ -181,10 +177,9 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = DT.findNearestCommonDominator(EarliestCapture, I); Captured = true; - // Continue analysis, as we need to see all potential captures. However, - // we do not need to follow the instruction result, as this use will - // dominate any captures made through the instruction result.. - return ContinueIgnoringReturn; + // Return false to continue analysis; we need to see all potential + // captures. + return false; } Instruction *EarliestCapture = nullptr; @@ -279,26 +274,25 @@ Instruction *llvm::FindEarliestCapture(const Value *V, Function &F, return CB.EarliestCapture; } -UseCaptureInfo llvm::DetermineUseCaptureKind( - const Use &U, const Value *Base, +UseCaptureKind llvm::DetermineUseCaptureKind( + const Use &U, function_ref IsDereferenceableOrNull) { Instruction *I = dyn_cast(U.getUser()); // TODO: Investigate non-instruction uses. if (!I) - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: { - // TODO(captures): Make this more precise. auto *Call = cast(I); // Not captured if the callee is readonly, doesn't return a copy through // its return value and doesn't unwind (a readonly function can leak bits // by throwing an exception or not depending on the input value). if (Call->onlyReadsMemory() && Call->doesNotThrow() && Call->getType()->isVoidTy()) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; // The pointer is not captured if returned pointer is not captured. // NOTE: CaptureTracking users should not assume that only functions @@ -306,13 +300,13 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // getUnderlyingObject in ValueTracking or DecomposeGEPExpression // in BasicAA also need to know about this property. 
if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, true)) - return UseCaptureInfo::passthrough(); + return UseCaptureKind::PASSTHROUGH; // Volatile operations effectively capture the memory location that they // load and store to. if (auto *MI = dyn_cast(Call)) if (MI->isVolatile()) - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; // Calling a function pointer does not in itself cause the pointer to // be captured. This is a subtle point considering that (for example) @@ -321,27 +315,30 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). if (Call->isCallee(&U)) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; // Not captured if only passed via 'nocapture' arguments. assert(Call->isDataOperand(&U) && "Non-callee must be data operand"); - CaptureInfo CI = Call->getCaptureInfo(Call->getDataOperandNo(&U)); - return UseCaptureInfo(CI.getOtherComponents(), CI.getRetComponents()); + if (!Call->doesNotCapture(Call->getDataOperandNo(&U))) { + // The parameter is not marked 'nocapture' - captured. + return UseCaptureKind::MAY_CAPTURE; + } + return UseCaptureKind::NO_CAPTURE; } case Instruction::Load: // Volatile loads make the address observable. if (cast(I)->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; case Instruction::VAArg: // "va-arg" from a pointer does not cause it to be captured. - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; case Instruction::Store: // Stored the pointer - conservatively assume it may be captured. // Volatile stores make the address observable. if (U.getOperandNo() == 0 || cast(I)->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; case Instruction::AtomicRMW: { // atomicrmw conceptually includes both a load and store from // the same location. @@ -350,8 +347,8 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // Volatile stores make the address observable. auto *ARMWI = cast(I); if (U.getOperandNo() == 1 || ARMWI->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; } case Instruction::AtomicCmpXchg: { // cmpxchg conceptually includes both a load and store from @@ -361,35 +358,31 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // Volatile stores make the address observable. auto *ACXI = cast(I); if (U.getOperandNo() == 1 || U.getOperandNo() == 2 || ACXI->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; } case Instruction::GetElementPtr: // AA does not support pointers of vectors, so GEP vector splats need to // be considered as captures. if (I->getType()->isVectorTy()) - return CaptureComponents::All; - return UseCaptureInfo::passthrough(); + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::PASSTHROUGH; case Instruction::BitCast: case Instruction::PHI: case Instruction::Select: case Instruction::AddrSpaceCast: // The original value is not captured via this if the new value isn't. 
- return UseCaptureInfo::passthrough(); + return UseCaptureKind::PASSTHROUGH; case Instruction::ICmp: { unsigned Idx = U.getOperandNo(); unsigned OtherIdx = 1 - Idx; - if (isa(I->getOperand(OtherIdx)) && - cast(I)->isEquality()) { - // TODO(captures): Remove these special cases once we make use of - // captures(address_is_null). - + if (auto *CPN = dyn_cast(I->getOperand(OtherIdx))) { // Don't count comparisons of a no-alias return value against null as // captures. This allows us to ignore comparisons of malloc results // with null, for example. - if (U->getType()->getPointerAddressSpace() == 0) + if (CPN->getType()->getAddressSpace() == 0) if (isNoAliasCall(U.get()->stripPointerCasts())) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; if (!I->getFunction()->nullPointerIsDefined()) { auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); // Comparing a dereferenceable_or_null pointer against null cannot @@ -397,23 +390,17 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // valid (in-bounds) pointer. const DataLayout &DL = I->getDataLayout(); if (IsDereferenceableOrNull && IsDereferenceableOrNull(O, DL)) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; } - - // Check whether this is a comparison of the base pointer against - // null. - if (U.get() == Base) - return CaptureComponents::AddressIsNull; } // Otherwise, be conservative. There are crazy ways to capture pointers - // using comparisons. However, only the address is captured, not the - // provenance. - return CaptureComponents::Address; + // using comparisons. + return UseCaptureKind::MAY_CAPTURE; } default: // Something else - be conservative and say it is captured. - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; } } @@ -451,26 +438,18 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, }; while (!Worklist.empty()) { const Use *U = Worklist.pop_back_val(); - UseCaptureInfo CI = DetermineUseCaptureKind(*U, V, IsDereferenceableOrNull); - if (capturesAnything(CI.UseCC)) { - switch (Tracker->captured(U, CI)) { - case CaptureTracker::Stop: + switch (DetermineUseCaptureKind(*U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: + continue; + case UseCaptureKind::MAY_CAPTURE: + if (Tracker->captured(U)) return; - case CaptureTracker::ContinueIgnoringReturn: - continue; - case CaptureTracker::Continue: - // Fall through to passthrough handling, but only if ResultCC contains - // additional components that UseCC does not. We assume that a - // capture at this point will be strictly more constraining than a - // later capture from following the return value. - if (capturesNothing(CI.ResultCC & ~CI.UseCC)) - continue; - break; - } + continue; + case UseCaptureKind::PASSTHROUGH: + if (!AddUses(U->getUser())) + return; + continue; } - // TODO(captures): We could keep track of ResultCC for the users. - if (capturesAnything(CI.ResultCC) && !AddUses(U->getUser())) - return; } // All uses examined. 
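To illustrate the restored interface, a minimal client could look like the following sketch (my illustration, not code from this patch): it records the first capturing use and stops, treating a traversal overflow as inconclusive.

.. code-block:: c++

   // Minimal sketch of a CaptureTracker client using the restored
   // bool-returning captured() callback.
   #include "llvm/Analysis/CaptureTracking.h"

   using namespace llvm;

   namespace {
   struct FirstCaptureTracker : public CaptureTracker {
     const Use *FirstCapture = nullptr;
     bool TooManyUses = false;

     // The traversal limit was hit; the result must be treated as unknown.
     void tooManyUses() override { TooManyUses = true; }

     // Returning true stops the traversal at the first capturing use.
     bool captured(const Use *U) override {
       FirstCapture = U;
       return true;
     }
   };
   } // namespace

   // Returns the first potentially capturing use of V, or nullptr if none was
   // seen (nullptr is inconclusive if the tracker's TooManyUses flag was set).
   const Use *findFirstCapture(const Value *V) {
     FirstCaptureTracker Tracker;
     PointerMayBeCaptured(V, &Tracker);
     return Tracker.FirstCapture;
   }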
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d25c1eecaf1ca..59002cd934ab1 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2788,8 +2788,7 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, struct CustomCaptureTracker : public CaptureTracker { bool Captured = false; void tooManyUses() override { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { if (auto *ICmp = dyn_cast(U->getUser())) { // Comparison against value stored in global variable. Given the // pointer does not escape, its value cannot be guessed and stored @@ -2797,11 +2796,11 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, unsigned OtherIdx = 1 - U->getOperandNo(); auto *LI = dyn_cast(ICmp->getOperand(OtherIdx)); if (LI && isa(LI->getPointerOperand())) - return Continue; + return false; } Captured = true; - return Stop; + return true; } }; CustomCaptureTracker Tracker; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 5a22ac8abc3fc..5dc5b025599b1 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -798,8 +798,13 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { // The access function must stride over the innermost loop. if (Lp != AR->getLoop()) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " - << *Ptr << " SCEV: " << *AR << "\n"); + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Not striding over innermost loop "; + if (Ptr) + dbgs() << *Ptr << " "; + + dbgs() << "SCEV: " << *AR << "\n"; + }); return std::nullopt; } @@ -809,8 +814,12 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, // Calculate the pointer stride and check if it is constant. const SCEVConstant *C = dyn_cast(Step); if (!C) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr - << " SCEV: " << *AR << "\n"); + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Not a constant strided "; + if (Ptr) + dbgs() << *Ptr << " "; + dbgs() << "SCEV: " << *AR << "\n"; + }); return std::nullopt; } @@ -837,8 +846,8 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, static bool isNoWrapGEP(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *L); -/// Check whether \p AR is a non-wrapping AddRec, or if \p Ptr is a non-wrapping -/// GEP. +/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use +/// information from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, std::optional Stride = std::nullopt) { @@ -846,12 +855,12 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, if (AR->getNoWrapFlags(SCEV::NoWrapMask)) return true; - if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) + if (Ptr && PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) return true; // The address calculation must not wrap. Otherwise, a dependence could be // inverted. - if (isNoWrapGEP(Ptr, PSE, L)) + if (Ptr && isNoWrapGEP(Ptr, PSE, L)) return true; // An nusw getelementptr that is an AddRec cannot wrap. 
If it would wrap, @@ -859,7 +868,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // location will be larger than half the pointer index type space. In that // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. - if (auto *GEP = dyn_cast(Ptr); + if (auto *GEP = dyn_cast_if_present(Ptr); GEP && GEP->hasNoUnsignedSignedWrap()) return true; @@ -875,7 +884,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, return true; } - if (Assume) { + if (Ptr && Assume) { PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" << "LAA: Pointer: " << *Ptr << "\n" @@ -1117,6 +1126,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, SmallVector> TranslatedPtrs = findForkedPointer(PSE, StridesMap, Ptr, TheLoop); + assert(!TranslatedPtrs.empty() && "must have some translated pointers"); /// Check whether all pointers can participate in a runtime bounds check. They /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also @@ -1142,13 +1152,10 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. - if (ShouldCheckWrap) { - // Skip wrap checking when translating pointers. - if (TranslatedPtrs.size() > 1) - return false; - - if (!isNoWrap(PSE, AR, Ptr, AccessTy, TheLoop, Assume)) - return false; + if (ShouldCheckWrap && + !isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy, + TheLoop, Assume)) { + return false; } } diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 97ceb16ccf53f..8e7b7d313706a 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,6 +60,26 @@ ArrayRef> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } +#define SHADER_VISIBILITY(Val, Enum) {#Enum, ShaderVisibility::Enum}, + +static const EnumEntry ShaderVisibilityValues[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> dxbc::getShaderVisibility() { + return ArrayRef(ShaderVisibilityValues); +} + +#define ROOT_PARAMETER(Val, Enum) {#Enum, RootParameterType::Enum}, + +static const EnumEntry RootParameterTypes[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> dxbc::getRootParameterTypes() { + return ArrayRef(RootParameterTypes); +} + #define SEMANTIC_KIND(Val, Enum) {#Enum, PSV::SemanticKind::Enum}, static const EnumEntry SemanticKindNames[] = { diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 413d9f68e6cc3..f8f5432c73a0f 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1229,13 +1229,13 @@ static Value *getValueFwdRef(BitcodeReaderValueList &ValueList, unsigned Idx, // This is a reference to a no longer supported constant expression. // Pretend that the constant was deleted, which will replace metadata - // references with undef. + // references with poison. // TODO: This is a rather indirect check. It would be more elegant to use // a separate ErrorInfo for constant materialization failure and thread // the error reporting through getValueFwdRef(). 
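Back in the LoopAccessAnalysis change a few hunks up: Ptr may now legitimately be null, so every check that consumes the IR pointer is gated, and the GEP probe switches from dyn_cast to dyn_cast_if_present, which returns null for a null input instead of asserting. A condensed sketch of the pattern, with names borrowed from createCheckForAccess above:

// With forked (translated) pointers there is no single IR pointer backing
// the access, so nullptr is passed and all Ptr-derived no-wrap evidence is
// skipped; only the SCEV-based checks remain.
llvm::Value *IRPtr = TranslatedPtrs.size() == 1 ? Ptr : nullptr;
// dyn_cast_if_present tolerates a null input, unlike dyn_cast.
if (auto *GEP = llvm::dyn_cast_if_present<llvm::GetElementPtrInst>(IRPtr);
    GEP && GEP->hasNoUnsignedSignedWrap())
  return true; // an nusw GEP that is an AddRec cannot wrap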
if (Idx < ValueList.size() && ValueList[Idx] && ValueList[Idx]->getType() == Ty) - return UndefValue::get(Ty); + return PoisonValue::get(Ty); return nullptr; } diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 1c603f5988ad1..e8d1aba63afb4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -252,10 +252,10 @@ void EHStreamer::computeCallSiteTable( if (&MBB == &Asm->MF->front() || MBB.isBeginSection()) { // We start a call-site range upon function entry and at the beginning of // every basic block section. - CallSiteRanges.push_back( - {Asm->MBBSectionRanges[MBB.getSectionID()].BeginLabel, - Asm->MBBSectionRanges[MBB.getSectionID()].EndLabel, - Asm->getMBBExceptionSym(MBB), CallSites.size()}); + auto &Range = Asm->MBBSectionRanges[MBB.getSectionID()]; + CallSiteRanges.push_back({Range.BeginLabel, Range.EndLabel, + Asm->getMBBExceptionSym(MBB), + CallSites.size()}); PreviousIsInvoke = false; SawPotentiallyThrowing = false; LastLabel = nullptr; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d4cb224c35d74..319c4ac28c167 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -8137,14 +8137,14 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI) { } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) { +LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI, bool ShouldCanonicalize) { unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ? TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE; auto [Dst, Src0, Src1] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Dst); - if (!MI.getFlag(MachineInstr::FmNoNans)) { + if (ShouldCanonicalize && !MI.getFlag(MachineInstr::FmNoNans)) { // Insert canonicalizes if it's possible we need to quiet to get correct // sNaN behavior. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bc7cdf38dbc2a..f52447b86a7e4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16145,7 +16145,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // also recursively replace t184 by t150. SDValue MaybePoisonOperand = N->getOperand(0).getOperand(OpNo); // Don't replace every single UNDEF everywhere with frozen UNDEF, though. - if (MaybePoisonOperand.getOpcode() == ISD::UNDEF) + if (MaybePoisonOperand.isUndef()) continue; // First, freeze each offending operand. SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand); @@ -16173,7 +16173,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { SmallVector Ops(N0->ops()); // Special-handle ISD::UNDEF, each single one of them can be it's own thing. for (SDValue &Op : Ops) { - if (Op.getOpcode() == ISD::UNDEF) + if (Op.isUndef()) Op = DAG.getFreeze(Op); } @@ -24289,7 +24289,7 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { if (ISD::BITCAST == Op.getOpcode() && !Op.getOperand(0).getValueType().isVector()) Ops.push_back(Op.getOperand(0)); - else if (ISD::UNDEF == Op.getOpcode()) + else if (Op.isUndef()) Ops.push_back(DAG.getNode(ISD::UNDEF, DL, SVT)); else return SDValue(); @@ -24684,7 +24684,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) // -> (BUILD_VECTOR A, B, ..., C, D, ...) 
auto IsBuildVectorOrUndef = [](const SDValue &Op) { - return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode(); + return Op.isUndef() || ISD::BUILD_VECTOR == Op.getOpcode(); }; if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) { SmallVector Opnds; @@ -24708,7 +24708,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { EVT OpVT = Op.getValueType(); unsigned NumElts = OpVT.getVectorNumElements(); - if (ISD::UNDEF == Op.getOpcode()) + if (Op.isUndef()) Opnds.append(NumElts, DAG.getUNDEF(MinVT)); if (ISD::BUILD_VECTOR == Op.getOpcode()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 80c2de1d99542..de092cba333c2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6285,7 +6285,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, Flags.setNonNeg(N1->getFlags().hasNonNeg()); return getNode(OpOpcode, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); break; @@ -6305,7 +6305,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, Flags.setNonNeg(N1->getFlags().hasNonNeg()); return getNode(ISD::ZERO_EXTEND, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); @@ -6347,7 +6347,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) return getNode(OpOpcode, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // (ext (trunc x)) -> x @@ -6382,7 +6382,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0)); return N1.getOperand(0); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); if (OpOpcode == ISD::VSCALE && !NewNodesMustHaveLegalTypes) return getVScale(DL, VT, @@ -6400,14 +6400,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::ABS: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid ABS!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getConstant(0, DL, VT); break; case ISD::BSWAP: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BSWAP!"); assert((VT.getScalarSizeInBits() % 16 == 0) && "BSWAP types must be a multiple of 16 bits!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // bswap(bswap(X)) -> X. if (OpOpcode == ISD::BSWAP) @@ -6415,7 +6415,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::BITREVERSE: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BITREVERSE!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::BITCAST: @@ -6424,7 +6424,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == N1.getValueType()) return N1; // noop conversion. 
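A quick aside on the mechanical edits running through these DAGCombiner and SelectionDAG hunks: SDValue::isUndef() is the idiomatic spelling of the explicit opcode comparison it replaces. At the time of this change the two forms should be interchangeable, roughly:

// Given some SDValue Op, these agree; the accessor form is preferred.
bool ViaAccessor = Op.isUndef();
bool ViaOpcode = Op.getOpcode() == ISD::UNDEF;
assert(ViaAccessor == ViaOpcode && "isUndef() is the opcode test here");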
if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) return getNode(ISD::BITCAST, DL, VT, N1.getOperand(0)); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::SCALAR_TO_VECTOR: @@ -6434,7 +6434,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, N1.getValueType().isInteger() && VT.getVectorElementType().bitsLE(N1.getValueType()))) && "Illegal SCALAR_TO_VECTOR node!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined. if (OpOpcode == ISD::EXTRACT_VECTOR_ELT && @@ -6445,7 +6445,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::FNEG: // Negation of an unknown bag of bits is still completely undefined. - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); if (OpOpcode == ISD::FNEG) // --X -> X @@ -13364,7 +13364,7 @@ void BuildVectorSDNode::recastRawBits(bool IsLittleEndian, bool BuildVectorSDNode::isConstant() const { for (const SDValue &Op : op_values()) { unsigned Opc = Op.getOpcode(); - if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP) + if (!Op.isUndef() && Opc != ISD::Constant && Opc != ISD::ConstantFP) return false; } return true; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 7771958f5adc9..5804a42172a7b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8488,7 +8488,8 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node, } SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, - SelectionDAG &DAG) const { + SelectionDAG &DAG, + bool ShouldCanonicalize) const { if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG)) return Expanded; @@ -8505,7 +8506,7 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, SDValue Quiet0 = Node->getOperand(0); SDValue Quiet1 = Node->getOperand(1); - if (!Node->getFlags().hasNoNaNs()) { + if (ShouldCanonicalize && !Node->getFlags().hasNoNaNs()) { // Insert canonicalizes if it's possible we need to quiet to get correct // sNaN behavior. 
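The guarded block that resumes below inserts FCANONICALIZE nodes to quiet possible signaling NaNs; the new ShouldCanonicalize parameter lets a caller opt out when its native min/max lowering already has IEEE NaN semantics. A hypothetical caller-side sketch, patterned on the AMDGPU GlobalISel change later in this patch (which passes !ST.hasIEEEMinNumMaxNum()); ST is an assumed subtarget reference:

// A target whose min/max already quiets sNaNs per IEEE-754 can skip the
// canonicalize step when expanding FMINNUM/FMAXNUM.
bool NeedsCanonicalize = !ST.hasIEEEMinNumMaxNum();
if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG, NeedsCanonicalize))
  return Expanded;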
if (!DAG.isKnownNeverSNaN(Quiet0)) { diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index b98523cac1f2f..1970716485613 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -251,15 +251,15 @@ void llvm::calculateCXXStateForAsynchEH(const BasicBlock *BB, int State, const BasicBlock *BB = WI->Block; int State = WI->State; delete WI; - if (auto It = EHInfo.BlockToStateMap.find(BB); - It != EHInfo.BlockToStateMap.end() && It->second <= State) + auto [StateIt, Inserted] = EHInfo.BlockToStateMap.try_emplace(BB); + if (!Inserted && StateIt->second <= State) continue; // skip blocks already visited by lower State BasicBlock::const_iterator It = BB->getFirstNonPHIIt(); const llvm::Instruction *TI = BB->getTerminator(); if (It->isEHPad()) State = EHInfo.EHPadStateMap[&*It]; - EHInfo.BlockToStateMap[BB] = State; // Record state, also flag visiting + StateIt->second = State; // Record state, also flag visiting if ((isa(TI) || isa(TI)) && State > 0) { // Retrive the new State diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b5d1bc81b9d95..e2d607368e94b 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -711,20 +711,6 @@ CaptureInfo CallBase::getCaptureInfo(unsigned OpNo) const { return OBU.isDeoptOperandBundle() ? CaptureInfo::none() : CaptureInfo::all(); } -bool CallBase::hasArgumentWithAdditionalReturnCaptureComponents() const { - for (unsigned I = 0, E = arg_size(); I < E; ++I) { - if (!getArgOperand(I)->getType()->isPointerTy()) - continue; - - CaptureInfo CI = getParamAttributes(I).getCaptureInfo(); - if (auto *Fn = dyn_cast(getCalledOperand())) - CI &= Fn->getAttributes().getParamAttrs(I).getCaptureInfo(); - if (capturesAnything(CI.getRetComponents() & ~CI.getOtherComponents())) - return true; - } - return false; -} - //===----------------------------------------------------------------------===// // CallInst Implementation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index e38fce764b640..1f94400f7c088 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -170,9 +170,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { // TODO: BridgeOS should be included in isOSDarwin. 
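Returning to the WinEHPrepare hunk above: the rewrite is the standard single-lookup DenseMap idiom. try_emplace simultaneously tests for presence and hands back an iterator that the later store writes through, so the key is hashed once instead of twice. Sketched in isolation, assuming the same map shape as BlockToStateMap:

llvm::DenseMap<const llvm::BasicBlock *, int> BlockToState;
// Value-initializes the mapped int if BB was absent.
auto [It, Inserted] = BlockToState.try_emplace(BB);
if (!Inserted && It->second <= State)
  return; // already visited with a lower-or-equal state
It->second = State; // write through the saved iterator; no second lookup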
setLibcallName(RTLIB::EXP10_F32, "__exp10f"); setLibcallName(RTLIB::EXP10_F64, "__exp10"); - } else { - setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); } if (TT.isGNUEnvironment() || TT.isOSFuchsia() || diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index b6f2b85bac74e..1db8f55a31658 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -12,12 +12,60 @@ using namespace llvm; using namespace llvm::mcdxbc; +static uint32_t writePlaceholder(raw_ostream &Stream) { + const uint32_t DummyValue = std::numeric_limits::max(); + uint32_t Offset = Stream.tell(); + support::endian::write(Stream, DummyValue, llvm::endianness::little); + return Offset; +} + +static void rewriteOffset(buffer_ostream &Stream, uint32_t Offset) { + uint32_t Value = + support::endian::byte_swap( + Stream.tell()); + Stream.pwrite(reinterpret_cast(&Value), sizeof(Value), Offset); +} + void RootSignatureDesc::write(raw_ostream &OS) const { + buffer_ostream BOS(OS); + const uint32_t NumParameters = Parameters.size(); + const uint32_t Zero = 0; + + support::endian::write(BOS, Header.Version, llvm::endianness::little); + support::endian::write(BOS, NumParameters, llvm::endianness::little); + + uint32_t HeaderPoint = writePlaceholder(BOS); + + support::endian::write(BOS, Zero, llvm::endianness::little); + support::endian::write(BOS, Zero, llvm::endianness::little); + support::endian::write(BOS, Header.Flags, llvm::endianness::little); + + rewriteOffset(BOS, HeaderPoint); + + SmallVector ParamsOffsets; + for (const auto &P : Parameters) { + support::endian::write(BOS, P.ParameterType, llvm::endianness::little); + support::endian::write(BOS, P.ShaderVisibility, llvm::endianness::little); + + ParamsOffsets.push_back(writePlaceholder(BOS)); + } + + assert(NumParameters == ParamsOffsets.size()); + for (size_t I = 0; I < NumParameters; ++I) { + rewriteOffset(BOS, ParamsOffsets[I]); + const auto &P = Parameters[I]; - support::endian::write(OS, Version, llvm::endianness::little); - support::endian::write(OS, NumParameters, llvm::endianness::little); - support::endian::write(OS, RootParametersOffset, llvm::endianness::little); - support::endian::write(OS, NumStaticSamplers, llvm::endianness::little); - support::endian::write(OS, StaticSamplersOffset, llvm::endianness::little); - support::endian::write(OS, Flags, llvm::endianness::little); + switch (P.ParameterType) { + case dxbc::RootParameterType::Constants32Bit: { + support::endian::write(BOS, P.Constants.ShaderRegister, + llvm::endianness::little); + support::endian::write(BOS, P.Constants.RegisterSpace, + llvm::endianness::little); + support::endian::write(BOS, P.Constants.Num32BitValues, + llvm::endianness::little); + } break; + case dxbc::RootParameterType::Empty: + llvm_unreachable("Invalid RootParameterType"); + } + } } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 1eb1453c65147..1a0463e8ac850 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -247,6 +247,7 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { } Error DirectX::RootSignature::parse(StringRef Data) { + const char *Begin = Data.begin(); const char *Current = Data.begin(); // Root Signature headers expects 6 integers to be present. 
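One note on the RootSignatureDesc::write implementation above before the parser hunk continues: it uses a two-phase emit-then-backpatch scheme, which is why the raw stream is wrapped in a buffer_ostream (a raw_pwrite_stream that buffers output so earlier bytes can be rewritten). A condensed sketch of a single fixup using the same Support APIs:

llvm::buffer_ostream BOS(OS); // buffers OS so pwrite() can revisit earlier bytes
uint32_t FixupPos = BOS.tell();
// Emit a placeholder where the offset field belongs.
uint32_t Placeholder = ~uint32_t(0);
llvm::support::endian::write(BOS, Placeholder, llvm::endianness::little);
// ... emit whatever precedes the referenced payload ...
// Patch the placeholder with the now-known offset, in little-endian order.
uint32_t Target = static_cast<uint32_t>(BOS.tell());
Target = llvm::support::endian::byte_swap(Target, llvm::endianness::little);
BOS.pwrite(reinterpret_cast<const char *>(&Target), sizeof(Target), FixupPos);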
@@ -288,6 +289,48 @@ Error DirectX::RootSignature::parse(StringRef Data) { llvm::Twine(FValue)); Flags = FValue; + Current = Begin + RootParametersOffset; + for (uint32_t It = 0; It < NumParameters; It++) { + dxbc::RootParameter NewParam; + + NewParam.ParameterType = + support::endian::read(Current); + if (!dxbc::RootSignatureValidations::isValidParameterType( + NewParam.ParameterType)) + return validationFailed("unsupported parameter type value read: " + + llvm::Twine((uint32_t)NewParam.ParameterType)); + + Current += sizeof(dxbc::RootParameterType); + + NewParam.ShaderVisibility = + support::endian::read( + Current); + if (!dxbc::RootSignatureValidations::isValidShaderVisibility( + NewParam.ShaderVisibility)) + return validationFailed("unsupported shader visibility flag value read: " + + llvm::Twine((uint32_t)NewParam.ShaderVisibility)); + + Current += sizeof(dxbc::ShaderVisibility); + + uint32_t Offset = + support::endian::read(Current); + Current += sizeof(uint32_t); + + switch (NewParam.ParameterType) { + + case dxbc::RootParameterType::Constants32Bit: + if (Error Err = readStruct(Data, Begin + Offset, NewParam.Constants)) + return Err; + break; + case dxbc::RootParameterType::Empty: + // unreachable because it was validated and assigned before this point. + llvm_unreachable("Invalid value for RootParameterType"); + } + + Parameters.push_back(NewParam); + } + return Error::success(); } diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 2d3d70db50c39..ac25d76709726 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -545,10 +545,6 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx90a"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: return "gfx90c"; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: return "gfx940"; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: return "gfx941"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: return "gfx942"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: diff --git a/llvm/lib/Object/GOFFObjectFile.cpp b/llvm/lib/Object/GOFFObjectFile.cpp index 7806953aecd29..a55005e689e62 100644 --- a/llvm/lib/Object/GOFFObjectFile.cpp +++ b/llvm/lib/Object/GOFFObjectFile.cpp @@ -503,8 +503,9 @@ GOFFObjectFile::getSectionContents(DataRefImpl Sec) const { std::copy(CompleteData.data(), CompleteData.data() + TxtDataSize, Data.begin() + TxtDataOffset); } - SectionDataCache[Sec.d.a] = Data; - return ArrayRef(SectionDataCache[Sec.d.a]); + auto &Cache = SectionDataCache[Sec.d.a]; + Cache = Data; + return ArrayRef(Cache); } uint64_t GOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const { diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index f6ed09c857bb7..87ba16fd40ba9 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -267,12 +267,9 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { continue; mcdxbc::RootSignatureDesc RS; - RS.Flags = P.RootSignature->getEncodedFlags(); - RS.Version = P.RootSignature->Version; - RS.NumParameters = P.RootSignature->NumParameters; - RS.RootParametersOffset = P.RootSignature->RootParametersOffset; - RS.NumStaticSamplers = P.RootSignature->NumStaticSamplers; - RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; + RS.Header.Flags = P.RootSignature->getEncodedFlags(); + RS.Header.Version = P.RootSignature->Version; + RS.Parameters = std::move(P.RootSignature->Parameters); RS.write(OS); break; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp 
b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index f03c7da65999d..bab2cb550be73 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -31,11 +31,11 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { DXContainerYAML::RootSignatureYamlDesc::RootSignatureYamlDesc( const object::DirectX::RootSignature &Data) - : Version(Data.getVersion()), NumParameters(Data.getNumParameters()), - RootParametersOffset(Data.getRootParametersOffset()), + : Version(Data.getVersion()), NumStaticSamplers(Data.getNumStaticSamplers()), StaticSamplersOffset(Data.getStaticSamplersOffset()) { uint32_t Flags = Data.getFlags(); + Parameters = Data.getParameters(); #define ROOT_ELEMENT_FLAG(Num, Val) \ Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -212,14 +212,33 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureYamlDesc &S) { IO.mapRequired("Version", S.Version); - IO.mapRequired("NumParameters", S.NumParameters); - IO.mapRequired("RootParametersOffset", S.RootParametersOffset); IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); + IO.mapRequired("Parameters", S.Parameters); #define ROOT_ELEMENT_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false); #include "llvm/BinaryFormat/DXContainerConstants.def" } +void MappingTraits::mapping(IO &IO, + dxbc::RootConstants &C) { + IO.mapRequired("Num32BitValues", C.Num32BitValues); + IO.mapRequired("RegisterSpace", C.RegisterSpace); + IO.mapRequired("ShaderRegister", C.ShaderRegister); +} + +void MappingTraits::mapping(IO &IO, + dxbc::RootParameter &P) { + IO.mapRequired("ParameterType", P.ParameterType); + IO.mapRequired("ShaderVisibility", P.ShaderVisibility); + switch (P.ParameterType) { + case dxbc::RootParameterType::Constants32Bit: + IO.mapRequired("Constants", P.Constants); + break; + case dxbc::RootParameterType::Empty: + llvm_unreachable("Invalid value for ParameterType"); + } +} + void MappingTraits::mapping(IO &IO, DXContainerYAML::Part &P) { IO.mapRequired("Name", P.Name); @@ -323,6 +342,18 @@ void ScalarEnumerationTraits::enumeration( IO.enumCase(Value, E.Name.str().c_str(), E.Value); } +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::RootParameterType &Value) { + for (const auto &E : dxbc::getRootParameterTypes()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::ShaderVisibility &Value) { + for (const auto &E : dxbc::getShaderVisibility()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + } // namespace yaml void DXContainerYAML::PSVInfo::mapInfoForVersion(yaml::IO &IO) { diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 05e4d85b2ea5d..1f970739c1e7e 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -609,8 +609,6 @@ void ScalarBitSetTraits::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90C, EF_AMDGPU_MACH); - BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH); - BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); diff --git a/llvm/lib/SandboxIR/BasicBlock.cpp 
b/llvm/lib/SandboxIR/BasicBlock.cpp index 983a5e8b8825e..b45c046402487 100644 --- a/llvm/lib/SandboxIR/BasicBlock.cpp +++ b/llvm/lib/SandboxIR/BasicBlock.cpp @@ -67,12 +67,6 @@ void BasicBlock::buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB) { // Skip instruction's label operands if (isa(Op)) continue; - // Skip metadata - if (isa(Op)) - continue; - // Skip asm - if (isa(Op)) - continue; Ctx.getOrCreateValue(Op); } } diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index 6a397b02d6bde..21039ce7ed834 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Context.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" #include "llvm/SandboxIR/Module.h" @@ -58,26 +59,264 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { if (!Pair.second) return It->second.get(); - if (auto *C = dyn_cast(LLVMV)) { - switch (C->getValueID()) { + // Instruction + if (auto *LLVMI = dyn_cast(LLVMV)) { + switch (LLVMI->getOpcode()) { + case llvm::Instruction::VAArg: { + auto *LLVMVAArg = cast(LLVMV); + It->second = std::unique_ptr(new VAArgInst(LLVMVAArg, *this)); + return It->second.get(); + } + case llvm::Instruction::Freeze: { + auto *LLVMFreeze = cast(LLVMV); + It->second = + std::unique_ptr(new FreezeInst(LLVMFreeze, *this)); + return It->second.get(); + } + case llvm::Instruction::Fence: { + auto *LLVMFence = cast(LLVMV); + It->second = std::unique_ptr(new FenceInst(LLVMFence, *this)); + return It->second.get(); + } + case llvm::Instruction::Select: { + auto *LLVMSel = cast(LLVMV); + It->second = std::unique_ptr(new SelectInst(LLVMSel, *this)); + return It->second.get(); + } + case llvm::Instruction::ExtractElement: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new ExtractElementInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::InsertElement: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new InsertElementInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::ShuffleVector: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new ShuffleVectorInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::ExtractValue: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new ExtractValueInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::InsertValue: { + auto *LLVMIns = cast(LLVMV); + It->second = + std::unique_ptr(new InsertValueInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::Br: { + auto *LLVMBr = cast(LLVMV); + It->second = std::unique_ptr(new BranchInst(LLVMBr, *this)); + return It->second.get(); + } + case llvm::Instruction::Load: { + auto *LLVMLd = cast(LLVMV); + It->second = std::unique_ptr(new LoadInst(LLVMLd, *this)); + return It->second.get(); + } + case llvm::Instruction::Store: { + auto *LLVMSt = cast(LLVMV); + It->second = std::unique_ptr(new StoreInst(LLVMSt, *this)); + return It->second.get(); + } + case llvm::Instruction::Ret: { + auto *LLVMRet = cast(LLVMV); + It->second = std::unique_ptr(new ReturnInst(LLVMRet, *this)); + return It->second.get(); + } + case llvm::Instruction::Call: { + auto *LLVMCall = cast(LLVMV); + It->second = std::unique_ptr(new CallInst(LLVMCall, *this)); + return It->second.get(); + } + case 
llvm::Instruction::Invoke: { + auto *LLVMInvoke = cast(LLVMV); + It->second = + std::unique_ptr(new InvokeInst(LLVMInvoke, *this)); + return It->second.get(); + } + case llvm::Instruction::CallBr: { + auto *LLVMCallBr = cast(LLVMV); + It->second = + std::unique_ptr(new CallBrInst(LLVMCallBr, *this)); + return It->second.get(); + } + case llvm::Instruction::LandingPad: { + auto *LLVMLPad = cast(LLVMV); + It->second = + std::unique_ptr(new LandingPadInst(LLVMLPad, *this)); + return It->second.get(); + } + case llvm::Instruction::CatchPad: { + auto *LLVMCPI = cast(LLVMV); + It->second = + std::unique_ptr(new CatchPadInst(LLVMCPI, *this)); + return It->second.get(); + } + case llvm::Instruction::CleanupPad: { + auto *LLVMCPI = cast(LLVMV); + It->second = + std::unique_ptr(new CleanupPadInst(LLVMCPI, *this)); + return It->second.get(); + } + case llvm::Instruction::CatchRet: { + auto *LLVMCRI = cast(LLVMV); + It->second = + std::unique_ptr(new CatchReturnInst(LLVMCRI, *this)); + return It->second.get(); + } + case llvm::Instruction::CleanupRet: { + auto *LLVMCRI = cast(LLVMV); + It->second = std::unique_ptr( + new CleanupReturnInst(LLVMCRI, *this)); + return It->second.get(); + } + case llvm::Instruction::GetElementPtr: { + auto *LLVMGEP = cast(LLVMV); + It->second = std::unique_ptr( + new GetElementPtrInst(LLVMGEP, *this)); + return It->second.get(); + } + case llvm::Instruction::CatchSwitch: { + auto *LLVMCatchSwitchInst = cast(LLVMV); + It->second = std::unique_ptr( + new CatchSwitchInst(LLVMCatchSwitchInst, *this)); + return It->second.get(); + } + case llvm::Instruction::Resume: { + auto *LLVMResumeInst = cast(LLVMV); + It->second = + std::unique_ptr(new ResumeInst(LLVMResumeInst, *this)); + return It->second.get(); + } + case llvm::Instruction::Switch: { + auto *LLVMSwitchInst = cast(LLVMV); + It->second = + std::unique_ptr(new SwitchInst(LLVMSwitchInst, *this)); + return It->second.get(); + } + case llvm::Instruction::FNeg: { + auto *LLVMUnaryOperator = cast(LLVMV); + It->second = std::unique_ptr( + new UnaryOperator(LLVMUnaryOperator, *this)); + return It->second.get(); + } + case llvm::Instruction::Add: + case llvm::Instruction::FAdd: + case llvm::Instruction::Sub: + case llvm::Instruction::FSub: + case llvm::Instruction::Mul: + case llvm::Instruction::FMul: + case llvm::Instruction::UDiv: + case llvm::Instruction::SDiv: + case llvm::Instruction::FDiv: + case llvm::Instruction::URem: + case llvm::Instruction::SRem: + case llvm::Instruction::FRem: + case llvm::Instruction::Shl: + case llvm::Instruction::LShr: + case llvm::Instruction::AShr: + case llvm::Instruction::And: + case llvm::Instruction::Or: + case llvm::Instruction::Xor: { + auto *LLVMBinaryOperator = cast(LLVMV); + It->second = std::unique_ptr( + new BinaryOperator(LLVMBinaryOperator, *this)); + return It->second.get(); + } + case llvm::Instruction::AtomicRMW: { + auto *LLVMAtomicRMW = cast(LLVMV); + It->second = std::unique_ptr( + new AtomicRMWInst(LLVMAtomicRMW, *this)); + return It->second.get(); + } + case llvm::Instruction::AtomicCmpXchg: { + auto *LLVMAtomicCmpXchg = cast(LLVMV); + It->second = std::unique_ptr( + new AtomicCmpXchgInst(LLVMAtomicCmpXchg, *this)); + return It->second.get(); + } + case llvm::Instruction::Alloca: { + auto *LLVMAlloca = cast(LLVMV); + It->second = + std::unique_ptr(new AllocaInst(LLVMAlloca, *this)); + return It->second.get(); + } + case llvm::Instruction::ZExt: + case llvm::Instruction::SExt: + case llvm::Instruction::FPToUI: + case llvm::Instruction::FPToSI: + case 
llvm::Instruction::FPExt: + case llvm::Instruction::PtrToInt: + case llvm::Instruction::IntToPtr: + case llvm::Instruction::SIToFP: + case llvm::Instruction::UIToFP: + case llvm::Instruction::Trunc: + case llvm::Instruction::FPTrunc: + case llvm::Instruction::BitCast: + case llvm::Instruction::AddrSpaceCast: { + auto *LLVMCast = cast(LLVMV); + It->second = std::unique_ptr(new CastInst(LLVMCast, *this)); + return It->second.get(); + } + case llvm::Instruction::PHI: { + auto *LLVMPhi = cast(LLVMV); + It->second = std::unique_ptr(new PHINode(LLVMPhi, *this)); + return It->second.get(); + } + case llvm::Instruction::ICmp: { + auto *LLVMICmp = cast(LLVMV); + It->second = std::unique_ptr(new ICmpInst(LLVMICmp, *this)); + return It->second.get(); + } + case llvm::Instruction::FCmp: { + auto *LLVMFCmp = cast(LLVMV); + It->second = std::unique_ptr(new FCmpInst(LLVMFCmp, *this)); + return It->second.get(); + } + case llvm::Instruction::Unreachable: { + auto *LLVMUnreachable = cast(LLVMV); + It->second = std::unique_ptr( + new UnreachableInst(LLVMUnreachable, *this)); + return It->second.get(); + } + default: + break; + } + It->second = std::unique_ptr( + new OpaqueInst(cast(LLVMV), *this)); + return It->second.get(); + } + // Constant + if (auto *LLVMC = dyn_cast(LLVMV)) { + switch (LLVMC->getValueID()) { case llvm::Value::ConstantIntVal: It->second = std::unique_ptr( - new ConstantInt(cast(C), *this)); + new ConstantInt(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::ConstantFPVal: It->second = std::unique_ptr( - new ConstantFP(cast(C), *this)); + new ConstantFP(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::BlockAddressVal: It->second = std::unique_ptr( - new BlockAddress(cast(C), *this)); + new BlockAddress(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::ConstantTokenNoneVal: It->second = std::unique_ptr( - new ConstantTokenNone(cast(C), *this)); + new ConstantTokenNone(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::ConstantAggregateZeroVal: { - auto *CAZ = cast(C); + auto *CAZ = cast(LLVMC); It->second = std::unique_ptr( new ConstantAggregateZero(CAZ, *this)); auto *Ret = It->second.get(); @@ -90,19 +329,19 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { return Ret; } case llvm::Value::ConstantPointerNullVal: - It->second = std::unique_ptr( - new ConstantPointerNull(cast(C), *this)); + It->second = std::unique_ptr(new ConstantPointerNull( + cast(LLVMC), *this)); return It->second.get(); case llvm::Value::PoisonValueVal: It->second = std::unique_ptr( - new PoisonValue(cast(C), *this)); + new PoisonValue(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::UndefValueVal: It->second = std::unique_ptr( - new UndefValue(cast(C), *this)); + new UndefValue(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::DSOLocalEquivalentVal: { - auto *DSOLE = cast(C); + auto *DSOLE = cast(LLVMC); It->second = std::unique_ptr( new DSOLocalEquivalent(DSOLE, *this)); auto *Ret = It->second.get(); @@ -111,297 +350,77 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { } case llvm::Value::ConstantArrayVal: It->second = std::unique_ptr( - new ConstantArray(cast(C), *this)); + new ConstantArray(cast(LLVMC), *this)); break; case llvm::Value::ConstantStructVal: It->second = std::unique_ptr( - new ConstantStruct(cast(C), *this)); + new ConstantStruct(cast(LLVMC), *this)); break; case llvm::Value::ConstantVectorVal: It->second = std::unique_ptr( - new 
ConstantVector(cast(C), *this)); + new ConstantVector(cast(LLVMC), *this)); break; case llvm::Value::FunctionVal: It->second = std::unique_ptr( - new Function(cast(C), *this)); + new Function(cast(LLVMC), *this)); break; case llvm::Value::GlobalIFuncVal: It->second = std::unique_ptr( - new GlobalIFunc(cast(C), *this)); + new GlobalIFunc(cast(LLVMC), *this)); break; case llvm::Value::GlobalVariableVal: It->second = std::unique_ptr( - new GlobalVariable(cast(C), *this)); + new GlobalVariable(cast(LLVMC), *this)); break; case llvm::Value::GlobalAliasVal: It->second = std::unique_ptr( - new GlobalAlias(cast(C), *this)); + new GlobalAlias(cast(LLVMC), *this)); break; case llvm::Value::NoCFIValueVal: It->second = std::unique_ptr( - new NoCFIValue(cast(C), *this)); + new NoCFIValue(cast(LLVMC), *this)); break; case llvm::Value::ConstantPtrAuthVal: It->second = std::unique_ptr( - new ConstantPtrAuth(cast(C), *this)); + new ConstantPtrAuth(cast(LLVMC), *this)); break; case llvm::Value::ConstantExprVal: It->second = std::unique_ptr( - new ConstantExpr(cast(C), *this)); + new ConstantExpr(cast(LLVMC), *this)); break; default: - It->second = std::unique_ptr(new Constant(C, *this)); + It->second = std::unique_ptr(new Constant(LLVMC, *this)); break; } auto *NewC = It->second.get(); - for (llvm::Value *COp : C->operands()) - getOrCreateValueInternal(COp, C); + for (llvm::Value *COp : LLVMC->operands()) + getOrCreateValueInternal(COp, LLVMC); return NewC; } - if (auto *Arg = dyn_cast(LLVMV)) { - It->second = std::unique_ptr(new Argument(Arg, *this)); + // Argument + if (auto *LLVMArg = dyn_cast(LLVMV)) { + It->second = std::unique_ptr(new Argument(LLVMArg, *this)); return It->second.get(); } - if (auto *BB = dyn_cast(LLVMV)) { + // BasicBlock + if (auto *LLVMBB = dyn_cast(LLVMV)) { assert(isa(U) && "This won't create a SBBB, don't call this function directly!"); - if (auto *SBBB = getValue(BB)) + if (auto *SBBB = getValue(LLVMBB)) return SBBB; return nullptr; } - assert(isa(LLVMV) && "Expected Instruction"); - - switch (cast(LLVMV)->getOpcode()) { - case llvm::Instruction::VAArg: { - auto *LLVMVAArg = cast(LLVMV); - It->second = std::unique_ptr(new VAArgInst(LLVMVAArg, *this)); - return It->second.get(); - } - case llvm::Instruction::Freeze: { - auto *LLVMFreeze = cast(LLVMV); - It->second = std::unique_ptr(new FreezeInst(LLVMFreeze, *this)); - return It->second.get(); - } - case llvm::Instruction::Fence: { - auto *LLVMFence = cast(LLVMV); - It->second = std::unique_ptr(new FenceInst(LLVMFence, *this)); - return It->second.get(); - } - case llvm::Instruction::Select: { - auto *LLVMSel = cast(LLVMV); - It->second = std::unique_ptr(new SelectInst(LLVMSel, *this)); + // Metadata + if (auto *LLVMMD = dyn_cast(LLVMV)) { + It->second = std::unique_ptr(new OpaqueValue(LLVMMD, *this)); return It->second.get(); } - case llvm::Instruction::ExtractElement: { - auto *LLVMIns = cast(LLVMV); - It->second = std::unique_ptr( - new ExtractElementInst(LLVMIns, *this)); + // InlineAsm + if (auto *LLVMAsm = dyn_cast(LLVMV)) { + It->second = std::unique_ptr(new OpaqueValue(LLVMAsm, *this)); return It->second.get(); } - case llvm::Instruction::InsertElement: { - auto *LLVMIns = cast(LLVMV); - It->second = std::unique_ptr( - new InsertElementInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::ShuffleVector: { - auto *LLVMIns = cast(LLVMV); - It->second = std::unique_ptr( - new ShuffleVectorInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::ExtractValue: { - auto *LLVMIns 
= cast(LLVMV); - It->second = - std::unique_ptr(new ExtractValueInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::InsertValue: { - auto *LLVMIns = cast(LLVMV); - It->second = - std::unique_ptr(new InsertValueInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::Br: { - auto *LLVMBr = cast(LLVMV); - It->second = std::unique_ptr(new BranchInst(LLVMBr, *this)); - return It->second.get(); - } - case llvm::Instruction::Load: { - auto *LLVMLd = cast(LLVMV); - It->second = std::unique_ptr(new LoadInst(LLVMLd, *this)); - return It->second.get(); - } - case llvm::Instruction::Store: { - auto *LLVMSt = cast(LLVMV); - It->second = std::unique_ptr(new StoreInst(LLVMSt, *this)); - return It->second.get(); - } - case llvm::Instruction::Ret: { - auto *LLVMRet = cast(LLVMV); - It->second = std::unique_ptr(new ReturnInst(LLVMRet, *this)); - return It->second.get(); - } - case llvm::Instruction::Call: { - auto *LLVMCall = cast(LLVMV); - It->second = std::unique_ptr(new CallInst(LLVMCall, *this)); - return It->second.get(); - } - case llvm::Instruction::Invoke: { - auto *LLVMInvoke = cast(LLVMV); - It->second = std::unique_ptr(new InvokeInst(LLVMInvoke, *this)); - return It->second.get(); - } - case llvm::Instruction::CallBr: { - auto *LLVMCallBr = cast(LLVMV); - It->second = std::unique_ptr(new CallBrInst(LLVMCallBr, *this)); - return It->second.get(); - } - case llvm::Instruction::LandingPad: { - auto *LLVMLPad = cast(LLVMV); - It->second = - std::unique_ptr(new LandingPadInst(LLVMLPad, *this)); - return It->second.get(); - } - case llvm::Instruction::CatchPad: { - auto *LLVMCPI = cast(LLVMV); - It->second = - std::unique_ptr(new CatchPadInst(LLVMCPI, *this)); - return It->second.get(); - } - case llvm::Instruction::CleanupPad: { - auto *LLVMCPI = cast(LLVMV); - It->second = - std::unique_ptr(new CleanupPadInst(LLVMCPI, *this)); - return It->second.get(); - } - case llvm::Instruction::CatchRet: { - auto *LLVMCRI = cast(LLVMV); - It->second = - std::unique_ptr(new CatchReturnInst(LLVMCRI, *this)); - return It->second.get(); - } - case llvm::Instruction::CleanupRet: { - auto *LLVMCRI = cast(LLVMV); - It->second = std::unique_ptr( - new CleanupReturnInst(LLVMCRI, *this)); - return It->second.get(); - } - case llvm::Instruction::GetElementPtr: { - auto *LLVMGEP = cast(LLVMV); - It->second = std::unique_ptr( - new GetElementPtrInst(LLVMGEP, *this)); - return It->second.get(); - } - case llvm::Instruction::CatchSwitch: { - auto *LLVMCatchSwitchInst = cast(LLVMV); - It->second = std::unique_ptr( - new CatchSwitchInst(LLVMCatchSwitchInst, *this)); - return It->second.get(); - } - case llvm::Instruction::Resume: { - auto *LLVMResumeInst = cast(LLVMV); - It->second = - std::unique_ptr(new ResumeInst(LLVMResumeInst, *this)); - return It->second.get(); - } - case llvm::Instruction::Switch: { - auto *LLVMSwitchInst = cast(LLVMV); - It->second = - std::unique_ptr(new SwitchInst(LLVMSwitchInst, *this)); - return It->second.get(); - } - case llvm::Instruction::FNeg: { - auto *LLVMUnaryOperator = cast(LLVMV); - It->second = std::unique_ptr( - new UnaryOperator(LLVMUnaryOperator, *this)); - return It->second.get(); - } - case llvm::Instruction::Add: - case llvm::Instruction::FAdd: - case llvm::Instruction::Sub: - case llvm::Instruction::FSub: - case llvm::Instruction::Mul: - case llvm::Instruction::FMul: - case llvm::Instruction::UDiv: - case llvm::Instruction::SDiv: - case llvm::Instruction::FDiv: - case llvm::Instruction::URem: - case llvm::Instruction::SRem: - case 
llvm::Instruction::FRem: - case llvm::Instruction::Shl: - case llvm::Instruction::LShr: - case llvm::Instruction::AShr: - case llvm::Instruction::And: - case llvm::Instruction::Or: - case llvm::Instruction::Xor: { - auto *LLVMBinaryOperator = cast(LLVMV); - It->second = std::unique_ptr( - new BinaryOperator(LLVMBinaryOperator, *this)); - return It->second.get(); - } - case llvm::Instruction::AtomicRMW: { - auto *LLVMAtomicRMW = cast(LLVMV); - It->second = - std::unique_ptr(new AtomicRMWInst(LLVMAtomicRMW, *this)); - return It->second.get(); - } - case llvm::Instruction::AtomicCmpXchg: { - auto *LLVMAtomicCmpXchg = cast(LLVMV); - It->second = std::unique_ptr( - new AtomicCmpXchgInst(LLVMAtomicCmpXchg, *this)); - return It->second.get(); - } - case llvm::Instruction::Alloca: { - auto *LLVMAlloca = cast(LLVMV); - It->second = std::unique_ptr(new AllocaInst(LLVMAlloca, *this)); - return It->second.get(); - } - case llvm::Instruction::ZExt: - case llvm::Instruction::SExt: - case llvm::Instruction::FPToUI: - case llvm::Instruction::FPToSI: - case llvm::Instruction::FPExt: - case llvm::Instruction::PtrToInt: - case llvm::Instruction::IntToPtr: - case llvm::Instruction::SIToFP: - case llvm::Instruction::UIToFP: - case llvm::Instruction::Trunc: - case llvm::Instruction::FPTrunc: - case llvm::Instruction::BitCast: - case llvm::Instruction::AddrSpaceCast: { - auto *LLVMCast = cast(LLVMV); - It->second = std::unique_ptr(new CastInst(LLVMCast, *this)); - return It->second.get(); - } - case llvm::Instruction::PHI: { - auto *LLVMPhi = cast(LLVMV); - It->second = std::unique_ptr(new PHINode(LLVMPhi, *this)); - return It->second.get(); - } - case llvm::Instruction::ICmp: { - auto *LLVMICmp = cast(LLVMV); - It->second = std::unique_ptr(new ICmpInst(LLVMICmp, *this)); - return It->second.get(); - } - case llvm::Instruction::FCmp: { - auto *LLVMFCmp = cast(LLVMV); - It->second = std::unique_ptr(new FCmpInst(LLVMFCmp, *this)); - return It->second.get(); - } - case llvm::Instruction::Unreachable: { - auto *LLVMUnreachable = cast(LLVMV); - It->second = std::unique_ptr( - new UnreachableInst(LLVMUnreachable, *this)); - return It->second.get(); - } - default: - break; - } - - It->second = std::unique_ptr( - new OpaqueInst(cast(LLVMV), *this)); - return It->second.get(); + llvm_unreachable("Unhandled LLVMV type!"); } Argument *Context::getOrCreateArgument(llvm::Argument *LLVMArg) { diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index b977b6aaaf619..30d9372e4afd1 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -1059,7 +1059,6 @@ def ProcessorFeatures { FeatureJS, FeatureLSE, FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureCCIDX, FeatureRDM]; - list Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, FeatureRandGen, FeaturePAuth, FeatureSM4, FeatureSHA2, @@ -1068,6 +1067,7 @@ def ProcessorFeatures { FeatureDotProd, FeatureFPARMv8, FeatureMatMulInt8, FeatureSSBS, FeatureCCIDX, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; + list Grace = !listconcat(NeoverseV2, [FeatureSVE2SM4, FeatureSVEAES, FeatureSVE2SHA3]); // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not @@ -1151,6 +1151,8 @@ def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4, [TuneX4]>; def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925, [TuneX925]>; +def : ProcessorModel<"grace", NeoverseV2Model, ProcessorFeatures.Grace, + [TuneNeoverseV2]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>; def : ProcessorModel<"neoverse-n1", NeoverseN1Model, @@ -1166,7 +1168,6 @@ def : ProcessorModel<"neoverse-v1", NeoverseV1Model, ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>; def : ProcessorModel<"neoverse-v2", NeoverseV2Model, ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>; -def : ProcessorAlias<"grace", "neoverse-v2">; def : ProcessorModel<"neoverse-v3", NeoverseV2Model, ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>; def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 6439149d801f6..effc8d2ed6b49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1040,12 +1040,6 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; -def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", - "HasForceStoreSC0SC1", - "true", - "Has SC0 and SC1 on stores" ->; - def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "HasSALUFloatInsts", "true", @@ -1619,28 +1613,6 @@ def FeatureISAVersion9_5_Common : FeatureSet< FeatureAtomicBufferPkAddBF16Inst ])>; -def FeatureISAVersion9_4_0 : FeatureSet< - !listconcat(FeatureISAVersion9_4_Common.Features, - [ - FeatureAddressableLocalMemorySize65536, - FeatureForceStoreSC0SC1, - FeatureFP8Insts, - FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, - FeatureXF32Insts - ])>; - -def FeatureISAVersion9_4_1 : FeatureSet< - !listconcat(FeatureISAVersion9_4_Common.Features, - [ - FeatureAddressableLocalMemorySize65536, - FeatureForceStoreSC0SC1, - FeatureFP8Insts, - FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, - FeatureXF32Insts - ])>; - def FeatureISAVersion9_4_2 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3bbbbcf71d8ae..cf3843869808b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4295,7 +4295,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( // TODO: Handle G_FSUB 0 as fneg // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. - (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() + (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard() // Packed instructions do not have abs modifiers. 
Mods |= SISrcMods::OP_SEL_1; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 649deee346e90..4ce8ffb39599b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2710,7 +2710,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, if (IsIEEEOp) return true; - return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; + return Helper.lowerFMinNumMaxNum(MI, !ST.hasIEEEMinNumMaxNum()) == + LegalizerHelper::Legalized; } bool AMDGPULegalizerInfo::legalizeExtractVectorElt( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index c0581e491720d..3159b497a1ecb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -192,8 +192,7 @@ class AMDGPUSwLowerLDS { void getLDSMemoryInstructions(Function *Func, SetVector &LDSInstructions); void replaceKernelLDSAccesses(Function *Func); - Value *getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, - Value *LDSPtr); + Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr); void translateLDSMemoryOperationsToGlobalMemory( Function *Func, Value *LoadMallocPtr, SetVector &LDSInstructions); @@ -655,20 +654,30 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions( } else if (AtomicCmpXchgInst *XCHG = dyn_cast(&Inst)) { if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) LDSInstructions.insert(&Inst); + } else if (AddrSpaceCastInst *ASC = dyn_cast(&Inst)) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) + LDSInstructions.insert(&Inst); } else continue; } } } -Value * -AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, +Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr) { assert(LDSPtr && "Invalid LDS pointer operand"); - Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty()); - Value *GEP = - IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt}); - return GEP; + Type *LDSPtrType = LDSPtr->getType(); + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + if (auto *VecPtrTy = dyn_cast(LDSPtrType)) { + // Handle vector of pointers + ElementCount NumElements = VecPtrTy->getElementCount(); + IntTy = VectorType::get(IntTy, NumElements); + } + Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy); + return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex}); } void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( @@ -681,7 +690,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( if (LoadInst *LI = dyn_cast(Inst)) { Value *LIOperand = LI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand); + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand); LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement, LI->getAlign(), LI->isVolatile()); NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); @@ -691,7 +700,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( } else if (StoreInst *SI = dyn_cast(Inst)) { Value *SIOperand = SI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand); + 
getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand); StoreInst *NewSI = IRB.CreateAlignedStore( SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); @@ -701,8 +710,8 @@ } else if (AtomicRMWInst *RMW = dyn_cast(Inst)) { Value *RMWPtrOperand = RMW->getPointerOperand(); Value *RMWValOperand = RMW->getValOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, RMWPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand); AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW( RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(), RMW->getOrdering(), RMW->getSyncScopeID()); @@ -712,8 +721,8 @@ RMW->eraseFromParent(); } else if (AtomicCmpXchgInst *XCHG = dyn_cast(Inst)) { Value *XCHGPtrOperand = XCHG->getPointerOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, XCHGPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand); AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg( Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(), XCHG->getAlign(), XCHG->getSuccessOrdering(), @@ -722,6 +731,16 @@ AsanInfo.Instructions.insert(NewXCHG); XCHG->replaceAllUsesWith(NewXCHG); XCHG->eraseFromParent(); + } else if (AddrSpaceCastInst *ASC = dyn_cast(Inst)) { + Value *AIOperand = ASC->getPointerOperand(); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand); + Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType()); + // Note: there is no need to add this instruction to AsanInfo's list of + // instructions to instrument; the FLAT_ADDRESS pointer will already have + // been instrumented by the asan pass that runs before this one. + ASC->replaceAllUsesWith(NewAI); + ASC->eraseFromParent(); } else report_fatal_error("Unimplemented LDS lowering instruction"); } diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 9ca853befba73..d3487daee364f 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1773,7 +1773,7 @@ def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; -// GFX940+. +// GFX942+. def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index ea6e703eba5d9..7988a9ac0ce55 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -814,7 +814,7 @@ defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax", } // End SubtargetPredicate = isGFX7GFX10GFX11 -// GFX940-, GFX11-only flat instructions. +// GFX942-, GFX11-only flat instructions. 
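Stepping back to the AMDGPUSwLowerLDS change above: the renamed getTranslatedGlobalMemoryPtrOfLDS treats the integer value of an LDS pointer as a byte offset into the malloc'd global buffer, and now also handles vectors of LDS pointers by widening the offset type. A standalone sketch of that translation (a hypothetical free function; the real code is a member of AMDGPUSwLowerLDS):

// Maps an addrspace(3) pointer, or a vector of them, to an equivalent
// pointer (or vector of offsets GEP'd) into the malloc'd global-memory block.
llvm::Value *translateLDSPtr(llvm::IRBuilder<> &IRB, const llvm::DataLayout &DL,
                             llvm::Value *MallocPtr, llvm::Value *LDSPtr) {
  llvm::Type *IntTy =
      DL.getIntPtrType(IRB.getContext(), /*AddressSpace=*/3); // LOCAL_ADDRESS
  if (auto *VecTy = llvm::dyn_cast<llvm::VectorType>(LDSPtr->getType()))
    IntTy = llvm::VectorType::get(IntTy, VecTy->getElementCount());
  // The LDS pointer's integer value becomes a byte index into the buffer.
  llvm::Value *Offset = IRB.CreatePtrToInt(LDSPtr, IntTy);
  return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), MallocPtr, {Offset});
}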
let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>; } // End SubtargetPredicate = HasFlatAtomicFaddF32Inst @@ -2076,7 +2076,7 @@ defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x1f>; let SubtargetPredicate = isGFX8GFX9NotGFX940 in { - // These instructions are encoded differently on gfx90* and gfx940. + // These instructions are encoded differently on gfx90* and gfx94*. defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 827598078af53..1ff75095b220a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2292,7 +2292,7 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 @@ -2600,7 +2600,7 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 @@ -2610,7 +2610,7 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index a86c76bb6075e..0b372e29efe67 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -192,15 +192,7 @@ def : ProcessorModel<"gfx90c", SIQuarterSpeedModel, FeatureISAVersion9_0_C.Features >; -def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel, - FeatureISAVersion9_4_0.Features ->; - -def : ProcessorModel<"gfx941", SIDPGFX940FullSpeedModel, - FeatureISAVersion9_4_1.Features ->; - -def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel, +def : ProcessorModel<"gfx942", SIDPGFX942FullSpeedModel, FeatureISAVersion9_4_2.Features >; @@ -213,8 +205,8 @@ def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel, FeatureISAVersion9_Generic.Features >; -// [gfx940, gfx941, gfx942] -def : ProcessorModel<"gfx9-4-generic", SIDPGFX940FullSpeedModel, +// [gfx942] +def : ProcessorModel<"gfx9-4-generic", SIDPGFX942FullSpeedModel, FeatureISAVersion9_4_Generic.Features >; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index b5e8e246825c7..55af5826e90d0 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -422,10 +422,10 @@ unsigned GCNSubtarget::getBaseMaxNumSGPRs( // Check if maximum number of SGPRs was explicitly requested using // "amdgpu-num-sgpr" attribute. 
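   // (For instance, a kernel declared with
   //  __attribute__((amdgpu_num_sgpr(32))) carries "amdgpu-num-sgpr"="32" in
   //  its IR attribute list; the value 32 is purely illustrative.)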
-  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
-    unsigned Requested =
-        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
+  unsigned Requested =
+      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
+  if (Requested != MaxNumSGPRs) {
     // Make sure requested value does not violate subtarget's specifications.
     if (Requested && (Requested <= ReservedNumSGPRs))
       Requested = 0;
@@ -504,10 +504,9 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
   // Check if maximum number of VGPRs was explicitly requested using
   // "amdgpu-num-vgpr" attribute.
-  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
-    unsigned Requested =
-        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
-
+  unsigned Requested =
+      F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
+  if (Requested != MaxNumVGPRs) {
     if (hasGFX90AInsts())
       Requested *= 2;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 342b211199dca..56162d18e039d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -246,7 +246,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasMADIntraFwdBug = false;
   bool HasVOPDInsts = false;
   bool HasVALUTransUseHazard = false;
-  bool HasForceStoreSC0SC1 = false;
   bool HasRequiredExportPriority = false;
   bool HasVmemWriteVgprInOrder = false;
   bool HasAshrPkInsts = false;
@@ -1264,8 +1263,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }

-  bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
-
   bool requiresCodeObjectV6() const { return RequiresCOV6; }

   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
@@ -1297,11 +1294,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

   bool hasPackedTID() const { return HasPackedTID; }

-  // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
+  // GFX94* is a derivation of GFX90A. hasGFX940Insts() being true implies that
   // hasGFX90AInsts is also true.
   bool hasGFX940Insts() const { return GFX940Insts; }

-  // GFX950 is a derivation to GFX940. hasGFX950Insts() implies that
+  // GFX950 is a derivation of GFX94*. hasGFX950Insts() implies that
   // hasGFX940Insts and hasGFX90AInsts are also true.
bool hasGFX950Insts() const { return GFX950Insts; } @@ -1431,6 +1428,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // \returns true if the target has IEEE fminimum/fmaximum instructions bool hasIEEEMinMax() const { return getGeneration() >= GFX12; } + // \returns true if the target has IEEE fminimum_num/fmaximum_num + // instructions + bool hasIEEEMinNumMaxNum() const { return getGeneration() >= GFX12; } + // \returns true if the target has IEEE fminimum3/fmaximum3 instructions bool hasIEEEMinMax3() const { return hasIEEEMinMax(); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 059bab5838526..4a4ad712e304d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -93,8 +93,6 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: AK = GK_GFX950; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; @@ -180,8 +178,6 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; - case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; - case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941; case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942; case GK_GFX950: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index f812ae652b63d..721601efcc804 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -542,7 +542,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_EXCP_FLAG_USER = 18, ID_TRAP_CTRL = 19, - // GFX940 specific registers + // GFX94* specific registers ID_XCC_ID = 20, ID_SQ_PERF_SNAPSHOT_DATA = 21, ID_SQ_PERF_SNAPSHOT_DATA1 = 22, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index ab396929162d0..fa15e73bc31d5 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -203,6 +203,8 @@ static unsigned macToMad(unsigned Opc) { return AMDGPU::V_FMA_F32_e64; case AMDGPU::V_FMAC_F16_e64: return AMDGPU::V_FMA_F16_gfx9_e64; + case AMDGPU::V_FMAC_F16_t16_e64: + return AMDGPU::V_FMA_F16_gfx9_t16_e64; case AMDGPU::V_FMAC_F16_fake16_e64: return AMDGPU::V_FMA_F16_gfx9_fake16_e64; case AMDGPU::V_FMAC_LEGACY_F32_e64: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e09b310d107ac..a60345bfedca9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6833,7 +6833,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, // mode functions, but this happens to be OK since it's only done in cases // where there is known no sNaN. 
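   // (For example, when only qNaNs are possible, fminnum(x, qNaN) folds to x
   // under both IEEE-754 2008 minNum and 2019 minimumNumber; the two semantics
   // diverge only for signaling-NaN inputs.)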
   if (IsIEEEMode)
-    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG,
+                                 !Subtarget->hasIEEEMinNumMaxNum());

   if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
       VT == MVT::v16bf16)
@@ -16823,39 +16824,39 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     // safe. The message phrasing also should be better.
     if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
       if (AS == AMDGPUAS::FLAT_ADDRESS) {
-        // gfx940, gfx12
+        // gfx942, gfx12
         if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
           return ReportUnsafeHWInst(AtomicExpansionKind::None);
       } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
-        // gfx90a, gfx940, gfx12
+        // gfx90a, gfx942, gfx12
         if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
           return ReportUnsafeHWInst(AtomicExpansionKind::None);

-        // gfx940, gfx12
+        // gfx942, gfx12
         if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
           return ReportUnsafeHWInst(AtomicExpansionKind::None);
       } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-        // gfx90a, gfx940, gfx12
+        // gfx90a, gfx942, gfx12
        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

-        // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
+        // While gfx90a/gfx942 support v2bf16 for global/flat, they do not for
         // buffer. gfx12 does have the buffer version.
         if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
           return ReportUnsafeHWInst(AtomicExpansionKind::None);
       }

-      // global and flat atomic fadd f64: gfx90a, gfx940.
+      // global and flat atomic fadd f64: gfx90a, gfx942.
       if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
         return ReportUnsafeHWInst(AtomicExpansionKind::None);

       if (AS != AMDGPUAS::FLAT_ADDRESS) {
         if (Ty->isFloatTy()) {
-          // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
+          // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
           // gfx11+.
           if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
             return ReportUnsafeHWInst(AtomicExpansionKind::None);

-          // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
+          // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
           if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
             return ReportUnsafeHWInst(AtomicExpansionKind::None);
         } else {
@@ -16867,7 +16868,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
       }
     }

-    // flat atomic fadd f32: gfx940, gfx11+.
+    // flat atomic fadd f32: gfx942, gfx11+.
     if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
       if (Subtarget->hasFlatAtomicFaddF32Inst())
         return ReportUnsafeHWInst(AtomicExpansionKind::None);
@@ -16906,7 +16907,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   // float, double restored in gfx10.
   // double removed again in gfx11, so only f32 for gfx11/gfx12.
   //
-  // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
+  // Within gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
   // no f32.
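   // (e.g. an `atomicrmw fmin ptr addrspace(1) %p, double %v` can stay a
   // hardware instruction on gfx90a, while the f32 form has to be expanded;
   // illustrative IR, not taken from this patch.)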
if (AS == AMDGPUAS::FLAT_ADDRESS) { if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ceab6c9dcca34..2cf6de73fa90c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3461,6 +3461,62 @@ std::optional SIInstrInfo::extractSubregFromImm(int64_t Imm, llvm_unreachable("covered subregister switch"); } +static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAD_F16_e64: + return AMDGPU::V_MADAK_F16; + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAD_F32_e64: + return AMDGPU::V_MADAK_F32; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMA_F32_e64: + return AMDGPU::V_FMAAK_F32; + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMA_F16_e64: + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? AMDGPU::V_FMAAK_F16_t16 + : AMDGPU::V_FMAAK_F16_fake16 + : AMDGPU::V_FMAAK_F16; + default: + llvm_unreachable("invalid instruction"); + } +} + +static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAD_F16_e64: + return AMDGPU::V_MADMK_F16; + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAD_F32_e64: + return AMDGPU::V_MADMK_F32; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMA_F32_e64: + return AMDGPU::V_FMAMK_F32; + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMA_F16_e64: + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? AMDGPU::V_FMAMK_F16_t16 + : AMDGPU::V_FMAMK_F16_fake16 + : AMDGPU::V_FMAMK_F16; + default: + llvm_unreachable("invalid instruction"); + } +} + bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) @@ -3533,6 +3589,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. @@ -3550,12 +3607,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src0 = &UseMI.getOperand(Src0Idx); - bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; - bool IsFMA = - Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -3586,18 +3637,15 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, !isInlineConstant(Def->getOperand(1))) return false; - unsigned NewOpc = - IsFMA ? (IsF32 ? 
AMDGPU::V_FMAMK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16) - : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); + unsigned NewOpc = getNewFMAMKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite - // would also require restricting their register classes. For now - // just bail out. - if (NewOpc == AMDGPU::V_FMAMK_F16_fake16) + // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16 + // takes VGPR_32_Lo128 operands, so the rewrite would also require + // restricting their register classes. For now just bail out. + if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || + NewOpc == AMDGPU::V_FMAMK_F16_fake16) return false; const std::optional SubRegImm = extractSubregFromImm( @@ -3613,7 +3661,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setIsKill(RegSrc->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3666,25 +3714,22 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } } - unsigned NewOpc = - IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16) - : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); + unsigned NewOpc = getNewFMAAKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite - // would also require restricting their register classes. For now - // just bail out. - if (NewOpc == AMDGPU::V_FMAAK_F16_fake16) + // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16 + // takes VGPR_32_Lo128 operands, so the rewrite would also require + // restricting their register classes. For now just bail out. + if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || + NewOpc == AMDGPU::V_FMAAK_F16_fake16) return false; // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3874,8 +3919,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { return AMDGPU::V_FMA_LEGACY_F32_e64; case AMDGPU::V_FMAC_F16_e32: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: - return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64 + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? 
AMDGPU::V_FMA_F16_gfx9_t16_e64 + : AMDGPU::V_FMA_F16_gfx9_fake16_e64 : AMDGPU::V_FMA_F16_gfx9_e64; case AMDGPU::V_FMAC_F32_e32: case AMDGPU::V_FMAC_F32_e64: @@ -3941,21 +3989,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } - assert( - Opc != AMDGPU::V_FMAC_F16_fake16_e32 && - "V_FMAC_F16_fake16_e32 is not supported and not expected to be present " - "pre-RA"); + assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 && + Opc != AMDGPU::V_FMAC_F16_fake16_e32 && + "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be " + "present pre-RA"); // Handle MAC/FMAC. - bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64; - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || - Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || - Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || @@ -3968,6 +4007,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return nullptr; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: @@ -4052,11 +4092,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { - unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16) - : AMDGPU::V_FMAAK_F32) - : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); + unsigned NewOpc = getNewFMAAKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -4071,11 +4107,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } } - unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16) - : AMDGPU::V_FMAMK_F32) - : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); + unsigned NewOpc = getNewFMAMKInst(ST, Opc); if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) @@ -4513,6 +4545,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_FMAC_F32_e64: case AMDGPU::V_FMAC_F64_e64: @@ -5569,7 +5602,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; - case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64; + case AMDGPU::S_FMAC_F16: + return ST.useRealTrue16Insts() ? 
AMDGPU::V_FMAC_F16_t16_e64 + : AMDGPU::V_FMAC_F16_fake16_e64; case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32; case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32; case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64; @@ -5931,11 +5966,15 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); - const MachineOperand *UsedLiteral = nullptr; + const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo); + + if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) { + const MachineOperand *UsedLiteral = nullptr; - int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); - int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; - if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); + int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; + + // TODO: Be more permissive with frame indexes. if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) { if (!LiteralLimit--) return false; @@ -5974,9 +6013,19 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return false; } } - } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() && - isF16PseudoScalarTrans(MI.getOpcode()) && - isInlineConstant(*MO, OpInfo)) { + } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) { + // There can be at most one literal operand, but it can be repeated. + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + const MachineOperand &Op = MI.getOperand(i); + if (!Op.isReg() && !Op.isFI() && + !isInlineConstant(Op, InstDesc.operands()[i]) && + !Op.isIdenticalTo(*MO)) + return false; + } + } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() && + isF16PseudoScalarTrans(MI.getOpcode())) { return false; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6e08aff24ec23..3faf0795157dc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3287,6 +3287,14 @@ def : GCNPat < (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >; +let True16Predicate = UseRealTrue16Insts in +def : GCNPat < + (fma (f16 (VOP3NoMods f16:$src0)), + (f16 (VOP3NoMods f16:$src1)), + (f16 (VOP3NoMods f16:$src2))), + (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2) +>; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < (fma (f16 (VOP3NoMods f16:$src0)), diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index be6cff873532b..34953f9c08db7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,11 +359,6 @@ class SICacheControl { /// Virtual destructor to allow derivations to be deleted. 
virtual ~SICacheControl() = default; - - virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const { - return false; - } }; class SIGfx6CacheControl : public SICacheControl { @@ -492,7 +487,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { } public: - SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, @@ -518,20 +512,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const override { - bool Changed = false; - if (ST.hasForceStoreSC0SC1() && - (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | - SIAtomicAddrSpace::GLOBAL | - SIAtomicAddrSpace::OTHER)) != - SIAtomicAddrSpace::NONE) { - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - } - return Changed; - } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2821,7 +2801,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { Changed |= expandLoad(*MOI, MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); - Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 117add324db56..2a374b360b04a 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -94,7 +94,7 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; def SIDPFullSpeedModel : SISchedMachineModel; -def SIDPGFX940FullSpeedModel : SISchedMachineModel; +def SIDPGFX942FullSpeedModel : SISchedMachineModel; def SIDPGFX950FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; def GFX11SpeedModel : SISchedMachineModel; @@ -276,7 +276,7 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; } // End SchedModel = SIDPFullSpeedModel -let SchedModel = SIDPGFX940FullSpeedModel in { +let SchedModel = SIDPGFX942FullSpeedModel in { defm : SICommonWriteRes; @@ -308,7 +308,7 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; -} // End SchedModel = SIDPGFX940FullSpeedModel +} // End SchedModel = SIDPGFX942FullSpeedModel let SchedModel = SIDPGFX950FullSpeedModel in { diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 979812e07fc3f..f03cde455f295 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + NewOpcode = AMDGPU::V_FMAAK_F16; + break; + case AMDGPU::V_FMA_F16_gfx9_t16_e64: + NewOpcode = AMDGPU::V_FMAAK_F16_t16; + break; case AMDGPU::V_FMA_F16_gfx9_fake16_e64: - NewOpcode = ST->hasTrue16BitInsts() ? 
AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16; + NewOpcode = AMDGPU::V_FMAAK_F16_fake16; break; } } @@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + NewOpcode = AMDGPU::V_FMAMK_F16; + break; + case AMDGPU::V_FMA_F16_gfx9_t16_e64: + NewOpcode = AMDGPU::V_FMAMK_F16_t16; + break; case AMDGPU::V_FMA_F16_gfx9_fake16_e64: - NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16; + NewOpcode = AMDGPU::V_FMAMK_F16_fake16; break; } } @@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { shrinkMadFma(MI); continue; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index a8e4ce133ffbc..e433b85489e6e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -216,7 +216,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus}, {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, - // GFX940 specific registers + // GFX942 specific registers {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940}, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 21898da1912f5..d5c6e8af109f4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1151,7 +1151,7 @@ let isCommutable = 1, isReMaterializable = 1 in { defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile, any_fadd>; } // End SubtargetPredicate = HasPackedFP32Ops - let SubtargetPredicate = HasPkMovB32 in + let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile>; } // End isCommutable = 1, isReMaterializable = 1 diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index eb1491feb611e..c7ed73d0e95f7 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -767,6 +767,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } + } else if (!Subtarget->isTargetMachO()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); } if (Subtarget->isThumb1Only()) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index fd390cdbf9057..8702f0eecf2aa 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -47,7 +47,7 @@ static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, return reportError(Ctx, "Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - RSD.Flags = Flag->getZExtValue(); + RSD.Header.Flags = Flag->getZExtValue(); return false; } @@ -93,7 +93,7 @@ static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, } static bool validate(LLVMContext *Ctx, const 
mcdxbc::RootSignatureDesc &RSD) { - if (!dxbc::RootSignatureValidations::isValidRootFlag(RSD.Flags)) { + if (!dxbc::RootSignatureValidations::isValidRootFlag(RSD.Header.Flags)) { return reportError(Ctx, "Invalid Root Signature flag value"); } return false; @@ -201,15 +201,14 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, // start root signature header Space++; - OS << indent(Space) << "Flags: " << format_hex(RS.Flags, 8) << ":\n"; - OS << indent(Space) << "Version: " << RS.Version << ":\n"; - OS << indent(Space) << "NumParameters: " << RS.NumParameters << ":\n"; - OS << indent(Space) << "RootParametersOffset: " << RS.RootParametersOffset - << ":\n"; - OS << indent(Space) << "NumStaticSamplers: " << RS.NumStaticSamplers - << ":\n"; - OS << indent(Space) << "StaticSamplersOffset: " << RS.StaticSamplersOffset - << ":\n"; + OS << indent(Space) << "Flags: " << format_hex(RS.Header.Flags, 8) << ":\n"; + OS << indent(Space) << "Version: " << RS.Header.Version << ":\n"; + OS << indent(Space) << "NumParameters: " << RS.Parameters.size() << ":\n"; + OS << indent(Space) + << "RootParametersOffset: " << RS.Parameters.size_in_bytes() << ":\n"; + OS << indent(Space) << "NumStaticSamplers: " << 0 << ":\n"; + OS << indent(Space) << "StaticSamplersOffset: " + << sizeof(RS.Header) + RS.Parameters.size_in_bytes() << ":\n"; Space--; // end root signature header } diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index d66e3e306d2ff..1710488e4e292 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1886,11 +1886,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf"); else setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf"); - - // Routines to handle fp16 storage type. 
-  setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
-  setLibcallName(RTLIB::FPROUND_F64_F16, "__truncdfhf2");
-  setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
 }

 const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f20502521829e..ed7963f35a7c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7704,6 +7704,48 @@ defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR;
 defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR;
 defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR;

+multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
+  def NAME : NVPTXInst<(outs),
+                       (ins Int32Regs:$tmem_addr),
+                       !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"),
+                       [(Intr Int32Regs:$tmem_addr)]>,
+                       Requires<[hasTcgen05Instructions]>;
+}
+defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
+defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>;
+
+multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> {
+  defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16");
+  defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret;
+  defvar fmt_intr = StrJoin<"_", [src_fmt]>.ret;
+
+  defvar shape_mc_asm = StrJoin<".", [shape, mc]>.ret;
+  defvar shape_mc_intr = !subst("::", "_", !subst(".", "_", shape_mc_asm));
+
+  defvar intr_prefix = StrJoin<"_", ["int_nvvm_tcgen05_cp", shape_mc_intr, fmt_intr]>.ret;
+  defvar IntrCG1 = !cast<Intrinsic>(intr_prefix # "_cg1");
+  defvar IntrCG2 = !cast<Intrinsic>(intr_prefix # "_cg2");
+
+  def NAME # _cg1 : NVPTXInst<(outs),
+                              (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc),
+                              "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;",
+                              [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>,
+                              Requires<[hasTcgen05Instructions]>;
+  def NAME # _cg2 : NVPTXInst<(outs),
+                              (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc),
+                              "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;",
+                              [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>,
+                              Requires<[hasTcgen05Instructions]>;
+}
+
+foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in {
+  defm TCGEN05_CP_128x256b # src_fmt : TCGEN05_CP_INTR<"128x256b", src_fmt>;
+  defm TCGEN05_CP_4x256b # src_fmt : TCGEN05_CP_INTR<"4x256b", src_fmt>;
+  defm TCGEN05_CP_128x128b # src_fmt : TCGEN05_CP_INTR<"128x128b", src_fmt>;
+  defm TCGEN05_CP_64x128_1 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::02_13">;
+  defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">;
+  defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">;
+}
 } // isConvergent

 let hasSideEffects = 1 in {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 4720928f472b3..d6c8e8d506799 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -197,11 +197,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   }

   setOperationAction(ISD::UADDO, RegVT, Custom);
-  setOperationAction(ISD::USUBO, RegVT, Custom);
-
-  // PowerPC uses addo_carry, subo_carry to propagate carry.
-  setOperationAction(ISD::UADDO_CARRY, RegVT, Custom);
-  setOperationAction(ISD::USUBO_CARRY, RegVT, Custom);

   // On P10, the default lowering generates better code using the
   // setbc instruction.
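The hunk above drops the boolean-carry (UADDO_CARRY/USUBO_CARRY) custom lowering, and the hunk below re-legalizes the legacy glued nodes in its place, where the carry travels in an MVT::Glue result instead of an i1/i32 value. A minimal hand-written sketch of chaining a double-word add through that glue (LHSLo/LHSHi/RHSLo/RHSHi are assumed values in scope, not names from this patch):

  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
  SDValue Lo = DAG.getNode(ISD::ADDC, DL, VTs, LHSLo, RHSLo);
  // ADDE consumes the glued carry produced by ADDC as its third operand.
  SDValue Hi = DAG.getNode(ISD::ADDE, DL, VTs, LHSHi, RHSHi, Lo.getValue(1));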
@@ -265,6 +260,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); } + // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry. + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Legal); + setOperationAction(ISD::ADDE, VT, Legal); + setOperationAction(ISD::SUBC, VT, Legal); + setOperationAction(ISD::SUBE, VT, Legal); + } + if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -1850,14 +1854,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return "PPCISD::SETBC"; case PPCISD::SETBCR: return "PPCISD::SETBCR"; - case PPCISD::ADDC: - return "PPCISD::ADDC"; - case PPCISD::ADDE: - return "PPCISD::ADDE"; - case PPCISD::SUBC: - return "PPCISD::SUBC"; - case PPCISD::SUBE: - return "PPCISD::SUBE"; } return nullptr; } @@ -12017,74 +12013,43 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("ERROR:Should return for all cases within swtich."); } -static SDValue ConvertCarryValueToCarryFlag(EVT SumType, SDValue Value, - SelectionDAG &DAG, - const PPCSubtarget &STI) { - SDLoc DL(Value); - if (STI.useCRBits()) - Value = DAG.getNode(ISD::SELECT, DL, SumType, Value, - DAG.getConstant(1, DL, SumType), - DAG.getConstant(0, DL, SumType)); - else - Value = DAG.getZExtOrTrunc(Value, DL, SumType); - SDValue Sum = DAG.getNode(PPCISD::ADDC, DL, DAG.getVTList(SumType, MVT::i32), - Value, DAG.getAllOnesConstant(DL, SumType)); - return Sum.getValue(1); -} +SDValue PPCTargetLowering::LowerUaddo(SDValue Op, SelectionDAG &DAG) const { + // Default to target independent lowering if there is a logical user of the + // carry-bit. + for (SDNode *U : Op->users()) { + if (U->getOpcode() == ISD::SELECT) + return SDValue(); + if (ISD::isBitwiseLogicOp(U->getOpcode())) { + for (unsigned i = 0, ie = U->getNumOperands(); i != ie; ++i) { + if (U->getOperand(i).getOpcode() != ISD::UADDO && + U->getOperand(i).getOpcode() != ISD::MERGE_VALUES) + return SDValue(); + } + } + } + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); -static SDValue ConvertCarryFlagToCarryValue(EVT SumType, SDValue Flag, - EVT CarryType, SelectionDAG &DAG, - const PPCSubtarget &STI) { - SDLoc DL(Flag); - SDValue Zero = DAG.getConstant(0, DL, SumType); - SDValue Carry = DAG.getNode( - PPCISD::ADDE, DL, DAG.getVTList(SumType, MVT::i32), Zero, Zero, Flag); - if (STI.useCRBits()) - return DAG.getSetCC(DL, CarryType, Carry, Zero, ISD::SETNE); - return DAG.getZExtOrTrunc(Carry, DL, CarryType); -} + // Default to target independent lowering for special cases handled there. + if (isOneConstant(RHS) || isAllOnesConstant(RHS)) + return SDValue(); -SDValue PPCTargetLowering::LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getNode()->getValueType(0); - SDLoc DL(Op); - SDNode *N = Op.getNode(); - EVT VT = N->getValueType(0); - EVT CarryType = N->getValueType(1); - unsigned Opc = N->getOpcode(); - bool IsAdd = Opc == ISD::UADDO; - Opc = IsAdd ? 
PPCISD::ADDC : PPCISD::SUBC; - SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), - N->getOperand(0), N->getOperand(1)); - SDValue Carry = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, - DAG, Subtarget); - if (!IsAdd) - Carry = DAG.getNode(ISD::XOR, DL, CarryType, Carry, - DAG.getAllOnesConstant(DL, CarryType)); - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, Carry); -} - -SDValue PPCTargetLowering::LowerADDSUBO_CARRY(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDNode *N = Op.getNode(); - unsigned Opc = N->getOpcode(); - EVT VT = N->getValueType(0); - EVT CarryType = N->getValueType(1); - SDValue CarryOp = N->getOperand(2); - bool IsAdd = Opc == ISD::UADDO_CARRY; - Opc = IsAdd ? PPCISD::ADDE : PPCISD::SUBE; - if (!IsAdd) - CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, - DAG.getAllOnesConstant(DL, CarryOp.getValueType())); - CarryOp = ConvertCarryValueToCarryFlag(VT, CarryOp, DAG, Subtarget); - SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), - Op.getOperand(0), Op.getOperand(1), CarryOp); - CarryOp = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, DAG, - Subtarget); - if (!IsAdd) - CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, - DAG.getAllOnesConstant(DL, CarryOp.getValueType())); - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, CarryOp); + SDValue ADDC; + SDValue Overflow; + SDVTList VTs = Op.getNode()->getVTList(); + + ADDC = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), LHS, RHS); + Overflow = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(VT, MVT::Glue), + DAG.getConstant(0, dl, VT), DAG.getConstant(0, dl, VT), + ADDC.getValue(1)); + SDValue OverflowTrunc = + DAG.getNode(ISD::TRUNCATE, dl, Op.getNode()->getValueType(1), Overflow); + SDValue Res = + DAG.getNode(ISD::MERGE_VALUES, dl, VTs, ADDC.getValue(0), OverflowTrunc); + return Res; } SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { @@ -12115,8 +12080,8 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - default: - llvm_unreachable("Wasn't expecting to be able to lower this!"); + default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::UADDO: return LowerUaddo(Op, DAG); case ISD::FPOW: return lowerPow(Op, DAG); case ISD::FSIN: return lowerSin(Op, DAG); case ISD::FCOS: return lowerCos(Op, DAG); @@ -12209,12 +12174,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerATOMIC_LOAD_STORE(Op, DAG); case ISD::IS_FPCLASS: return LowerIS_FPCLASS(Op, DAG); - case ISD::UADDO: - case ISD::USUBO: - return LowerADDSUBO(Op, DAG); - case ISD::UADDO_CARRY: - case ISD::USUBO_CARRY: - return LowerADDSUBO_CARRY(Op, DAG); } } @@ -16150,21 +16109,6 @@ static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) { return true; } -static SDValue DAGCombineAddc(SDNode *N, - llvm::PPCTargetLowering::DAGCombinerInfo &DCI) { - if (N->getOpcode() == PPCISD::ADDC && N->hasAnyUseOfValue(1)) { - // (ADDC (ADDE 0, 0, C), -1) -> C - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS->getOpcode() == PPCISD::ADDE && - isNullConstant(LHS->getOperand(0)) && - isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) { - return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); - } - } - return SDValue(); -} - SDValue 
PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -16953,8 +16897,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); - case PPCISD::ADDC: - return DAGCombineAddc(N, DCI); } return SDValue(); @@ -17008,16 +16950,6 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero = 0xFFFF0000; break; } - case PPCISD::ADDE: { - if (Op.getResNo() == 0) { - // (0|1), _ = ADDE 0, 0, CARRY - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - if (isNullConstant(LHS) && isNullConstant(RHS)) - Known.Zero = ~1ULL; - } - break; - } case ISD::INTRINSIC_WO_CHAIN: { switch (Op.getConstantOperandVal(0)) { default: break; @@ -18287,8 +18219,7 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, return SDValue(); SDLoc DL(N); - EVT CarryType = Subtarget.useCRBits() ? MVT::i1 : MVT::i32; - SDVTList VTs = DAG.getVTList(MVT::i64, CarryType); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue); SDValue Cmp = RHS.getOperand(0); SDValue Z = Cmp.getOperand(0); auto *Constant = cast(Cmp.getOperand(1)); @@ -18306,14 +18237,11 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? Add : Z; - SDValue Addc = - DAG.getNode(ISD::UADDO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64), - DAG.getConstant(0, DL, CarryType)); - return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, - DAG.getConstant(0, DL, MVT::i64), + SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64)); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), SDValue(Addc.getNode(), 1)); - } + } case ISD::SETEQ: { // when C == 0 // --> addze X, (subfic Z, 0).carry @@ -18324,15 +18252,11 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? Add : Z; - SDValue Subc = - DAG.getNode(ISD::USUBO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - DAG.getConstant(0, DL, MVT::i64), AddOrZ, - DAG.getConstant(0, DL, CarryType)); - SDValue Invert = DAG.getNode(ISD::XOR, DL, CarryType, Subc.getValue(1), - DAG.getAllOnesConstant(DL, CarryType)); - return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, - DAG.getConstant(0, DL, MVT::i64), Invert); - } + SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + DAG.getConstant(0, DL, MVT::i64), AddOrZ); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), + SDValue(Subc.getNode(), 1)); + } } return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 2d86a224b54c1..514329bbe92d7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -161,12 +161,6 @@ namespace llvm { SRA, SHL, - /// These nodes represent PPC arithmetic operations with carry. - ADDC, - ADDE, - SUBC, - SUBE, - /// FNMSUB - Negated multiply-subtract instruction. 
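    /// (i.e. it computes -((A * B) - C); noted here for clarity.)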
FNMSUB, @@ -1286,6 +1280,7 @@ namespace llvm { SDValue LowerGlobalTLSAddressLinux(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUaddo(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; @@ -1321,8 +1316,6 @@ namespace llvm { SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerToLibCall(const char *LibCallName, SDValue Op, SelectionDAG &DAG) const; SDValue lowerLibCallBasedOnType(const char *LibCallFloatName, diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 4205b3086a3c9..bcac0de55d9d3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -760,13 +760,13 @@ def STFDXTLS : XForm_8<31, 727, (outs), (ins f8rc:$RST, ptr_rc_nor0:$RA, tlsreg: let isCommutable = 1 in defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCaddc i64:$RA, i64:$RB))]>, + [(set i64:$RT, (addc i64:$RA, i64:$RB))]>, PPC970_DGroup_Cracked; let Defs = [CARRY] in def ADDIC8 : DForm_2<12, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (PPCaddc i64:$RA, imm64SExt16:$D))]>; + [(set i64:$RST, (addc i64:$RA, imm64SExt16:$D))]>; def ADDI8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), "addi $RST, $RA, $D", IIC_IntSimple, [(set i64:$RST, (add i64:$RA, imm64SExt16:$D))]>; @@ -782,11 +782,11 @@ def LA8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), let Defs = [CARRY] in { def SUBFIC8: DForm_2< 8, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (PPCsubc imm64SExt16:$D, i64:$RA))]>; + [(set i64:$RST, (subc imm64SExt16:$D, i64:$RA))]>; } defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCsubc i64:$RB, i64:$RA))]>, + [(set i64:$RT, (subc i64:$RB, i64:$RA))]>, PPC970_DGroup_Cracked; defm SUBF8 : XOForm_1rx<31, 40, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subf", "$RT, $RA, $RB", IIC_IntGeneral, @@ -798,22 +798,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, i64:$RB, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, i64:$RB))]>; defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, -1, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, -1))]>; defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, 0, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, 0))]>; defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCsube i64:$RB, i64:$RA, CARRY))]>; + [(set 
i64:$RT, (sube i64:$RB, i64:$RA))]>; defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCsube -1, i64:$RA, CARRY))]>; + [(set i64:$RT, (sube -1, i64:$RA))]>; defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCsube 0, i64:$RA, CARRY))]>; + [(set i64:$RT, (sube 0, i64:$RA))]>; } } // isCodeGenOnly diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 6e0640fa715ea..3aef6f2c893fa 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1758,23 +1758,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; - } else if ((PPC::G8RCRegClass.contains(DestReg) || - PPC::GPRCRegClass.contains(DestReg)) && - SrcReg == PPC::CARRY) { - bool Is64Bit = PPC::G8RCRegClass.contains(DestReg); - BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MFSPR8 : PPC::MFSPR), DestReg) - .addImm(1) - .addReg(PPC::CARRY, RegState::Implicit); - return; - } else if ((PPC::G8RCRegClass.contains(SrcReg) || - PPC::GPRCRegClass.contains(SrcReg)) && - DestReg == PPC::CARRY) { - bool Is64Bit = PPC::G8RCRegClass.contains(SrcReg); - BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MTSPR8 : PPC::MTSPR)) - .addImm(1) - .addReg(SrcReg) - .addReg(PPC::CARRY, RegState::ImplicitDefine); - return; } unsigned Opc; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index b5ed5d55da4c7..be90a5c562c57 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -124,21 +124,6 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> ]>; -// RES, CARRY = op LHS, RHS -def SDT_PPCBinaryArithWithFlagsOut : SDTypeProfile<2, 2, [ - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisVT<1, i32>, -]>; - -// RES, CARRY = op LHS, RHS, CARRY -def SDT_PPCBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [ - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisSameAs<1, 4>, - SDTCisVT<1, i32>, -]>; - //===----------------------------------------------------------------------===// // PowerPC specific DAG Nodes. // @@ -416,15 +401,6 @@ def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", SDTIntUnaryOp, []>; -def PPCaddc : SDNode<"PPCISD::ADDC", SDT_PPCBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def PPCadde : SDNode<"PPCISD::ADDE", SDT_PPCBinaryArithWithFlagsInOut, - []>; -def PPCsubc : SDNode<"PPCISD::SUBC", SDT_PPCBinaryArithWithFlagsOut, - []>; -def PPCsube : SDNode<"PPCISD::SUBE", SDT_PPCBinaryArithWithFlagsInOut, - []>; - //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. 
// @@ -2315,7 +2291,7 @@ let BaseName = "addic" in { let Defs = [CARRY] in def ADDIC : DForm_2<12, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (PPCaddc i32:$RA, imm32SExt16:$D))]>, + [(set i32:$RST, (addc i32:$RA, imm32SExt16:$D))]>, RecFormRel, PPC970_DGroup_Cracked; let Defs = [CARRY, CR0] in def ADDIC_rec : DForm_2<13, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), @@ -2336,7 +2312,7 @@ def MULLI : DForm_2< 7, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), let Defs = [CARRY] in def SUBFIC : DForm_2< 8, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (PPCsubc imm32SExt16:$D, i32:$RA))]>; + [(set i32:$RST, (subc imm32SExt16:$D, i32:$RA))]>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def LI : DForm_2_r0<14, (outs gprc:$RST), (ins s16imm:$D), @@ -2933,7 +2909,7 @@ def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$RT), (ins gprc:$RA, tlsreg32:$RB let isCommutable = 1 in defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCaddc i32:$RA, i32:$RB))]>, + [(set i32:$RT, (addc i32:$RA, i32:$RB))]>, PPC970_DGroup_Cracked; defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), @@ -2966,7 +2942,7 @@ defm SUBF : XOForm_1rx<31, 40, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), [(set i32:$RT, (sub i32:$RB, i32:$RA))]>; defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCsubc i32:$RB, i32:$RA))]>, + [(set i32:$RT, (subc i32:$RB, i32:$RA))]>, PPC970_DGroup_Cracked; defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$RT), (ins gprc:$RA), "neg", "$RT, $RA", IIC_IntSimple, @@ -2975,22 +2951,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, i32:$RB, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, i32:$RB))]>; defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$RT), (ins gprc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, -1, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, -1))]>; defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$RT), (ins gprc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, 0, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, 0))]>; defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCsube i32:$RB, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube i32:$RB, i32:$RA))]>; defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$RT), (ins gprc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCsube -1, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube -1, i32:$RA))]>; defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$RT), (ins gprc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCsube 0, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube 0, i32:$RA))]>; } } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2177dba1e5762..b60a91be82406 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -625,13 +625,6 @@ bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg, return BaseImplRetVal; } -const TargetRegisterClass * -PPCRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) 
const { - if (RC == &PPC::CARRYRCRegClass) - return TM.isPPC64() ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - return RC; -} - unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 21b6f7b13939a..274c7cb68ae0a 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -76,9 +76,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; - const TargetRegisterClass * - getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 8b690b7b833b3..3cb7cd9d8f229 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -494,7 +494,6 @@ def LR8RC : RegisterClass<"PPC", [i64], 64, (add LR8)> { def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> { let CopyCost = -1; - let isAllocatable = 0; } // Make AllocationOrder as similar as G8RC's to avoid potential spilling. diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 9b23a5ab521c8..5d1ea50eba494 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -37,6 +37,7 @@ add_llvm_target(RISCVCodeGen RISCVMakeCompressible.cpp RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp + RISCVFoldMemOffset.cpp RISCVFrameLowering.cpp RISCVGatherScatterLowering.cpp RISCVIndirectBranchTracking.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 851eea1352852..641e2eb4094f9 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -52,6 +52,9 @@ void initializeRISCVVectorPeepholePass(PassRegistry &); FunctionPass *createRISCVOptWInstrsPass(); void initializeRISCVOptWInstrsPass(PassRegistry &); +FunctionPass *createRISCVFoldMemOffsetPass(); +void initializeRISCVFoldMemOffsetPass(PassRegistry &); + FunctionPass *createRISCVMergeBaseOffsetOptPass(); void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp new file mode 100644 index 0000000000000..989e9d859d64f --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp @@ -0,0 +1,282 @@ +//===- RISCVFoldMemOffset.cpp - Fold ADDI into memory offsets ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// Look for ADDIs that can be removed by folding their immediate into later +// load/store addresses. There may be other arithmetic instructions between the +// addi and load/store that we need to reassociate through. If the final result +// of the arithmetic is only used by load/store addresses, we can fold the +// offset into all of the loads/stores as long as it doesn't create an offset +// that is too large.
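As a concrete illustration of the fold described above (hypothetical registers and constants, not taken from the patch's tests): an addi feeding the shifted operand of a sh2add contributes its immediate scaled by the shift, and the rewrite is legal only if the combined value still fits the 12-bit signed immediate of RISC-V memory instructions. A sketch of just that legality arithmetic in plain C++:

#include <cassert>
#include <cstdint>

// Before: addi   a1, a0, 8
//         sh2add a2, a1, a3   ; a2 = (a1 << 2) + a3
//         lw     a4, 4(a2)
// After:  sh2add a2, a0, a3
//         lw     a4, 36(a2)   ; 4 + (8 << 2); the addi is now dead

// Contribution of an offset that flows through shNadd's shifted operand.
int64_t throughShiftedOperand(int64_t Offset, unsigned ShAmt) {
  return (int64_t)((uint64_t)Offset << ShAmt); // wrapping, like the pass
}

// RISC-V loads/stores take a 12-bit signed immediate.
bool fitsInSImm12(int64_t V) { return V >= -2048 && V <= 2047; }

int main() {
  int64_t Contribution = throughShiftedOperand(8, 2); // 32
  int64_t NewImm = 4 + Contribution;                  // 36
  assert(fitsInSImm12(NewImm)); // legal: rewrite the lw offset to 36
}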
+// +//===---------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include <queue> + +using namespace llvm; + +#define DEBUG_TYPE "riscv-fold-mem-offset" +#define RISCV_FOLD_MEM_OFFSET_NAME "RISC-V Fold Memory Offset" + +namespace { + +class RISCVFoldMemOffset : public MachineFunctionPass { +public: + static char ID; + + RISCVFoldMemOffset() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + bool foldOffset(Register OrigReg, int64_t InitialOffset, + const MachineRegisterInfo &MRI, + DenseMap<MachineInstr *, int64_t> &FoldableInstrs); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return RISCV_FOLD_MEM_OFFSET_NAME; } +}; + +// Wrapper class around a std::optional to allow accumulation. +class FoldableOffset { + std::optional<int64_t> Offset; + +public: + bool hasValue() const { return Offset.has_value(); } + int64_t getValue() const { return *Offset; } + + FoldableOffset &operator=(int64_t RHS) { + Offset = RHS; + return *this; + } + + FoldableOffset &operator+=(int64_t RHS) { + if (!Offset) + Offset = 0; + Offset = (uint64_t)*Offset + (uint64_t)RHS; + return *this; + } + + int64_t operator*() { return *Offset; } +}; + +} // end anonymous namespace + +char RISCVFoldMemOffset::ID = 0; +INITIALIZE_PASS(RISCVFoldMemOffset, DEBUG_TYPE, RISCV_FOLD_MEM_OFFSET_NAME, + false, false) + +FunctionPass *llvm::createRISCVFoldMemOffsetPass() { + return new RISCVFoldMemOffset(); +} + +// Walk forward from the ADDI looking for arithmetic instructions we can +// analyze or memory instructions that use it as part of their address +// calculation. For each arithmetic instruction we look up how the offset +// contributes to the value in that register and use that information to +// calculate the contribution to the output of this instruction. +// Only addition and left shift are supported. +// FIXME: Add multiplication by constant. The constant will be in a register. +bool RISCVFoldMemOffset::foldOffset( + Register OrigReg, int64_t InitialOffset, const MachineRegisterInfo &MRI, + DenseMap<MachineInstr *, int64_t> &FoldableInstrs) { + // Map to hold how much the offset contributes to the value of this register. + DenseMap<Register, int64_t> RegToOffsetMap; + + // Insert root offset into the map.
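+ // Every later entry is derived from this seed: for each register, the map + // records how much of the original ADDI immediate has flowed into its value.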
+ RegToOffsetMap[OrigReg] = InitialOffset; + + std::queue<Register> Worklist; + Worklist.push(OrigReg); + + while (!Worklist.empty()) { + Register Reg = Worklist.front(); + Worklist.pop(); + + if (!Reg.isVirtual()) + return false; + + for (auto &User : MRI.use_nodbg_instructions(Reg)) { + FoldableOffset Offset; + + switch (User.getOpcode()) { + default: + return false; + case RISCV::ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = I->second; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::SH1ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << 1; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::SH2ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << 2; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::SH3ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << 3; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::ADD_UW: + case RISCV::SH1ADD_UW: + case RISCV::SH2ADD_UW: + case RISCV::SH3ADD_UW: + // Don't fold through the zero extended input. + if (User.getOperand(1).getReg() == Reg) + return false; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset = I->second; + break; + case RISCV::SLLI: { + unsigned ShAmt = User.getOperand(2).getImm(); + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << ShAmt; + break; + } + case RISCV::LB: + case RISCV::LBU: + case RISCV::SB: + case RISCV::LH: + case RISCV::LH_INX: + case RISCV::LHU: + case RISCV::FLH: + case RISCV::SH: + case RISCV::SH_INX: + case RISCV::FSH: + case RISCV::LW: + case RISCV::LW_INX: + case RISCV::LWU: + case RISCV::FLW: + case RISCV::SW: + case RISCV::SW_INX: + case RISCV::FSW: + case RISCV::LD: + case RISCV::FLD: + case RISCV::SD: + case RISCV::FSD: { + // Can't fold into store value. + if (User.getOperand(0).getReg() == Reg) + return false; + + // Existing offset must be immediate. + if (!User.getOperand(2).isImm()) + return false; + + // Require at least one operation between the ADDI and the load/store. + // We have other optimizations that should handle the simple case. + if (User.getOperand(1).getReg() == OrigReg) + return false; + + auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + if (I == RegToOffsetMap.end()) + return false; + + int64_t LocalOffset = User.getOperand(2).getImm(); + assert(isInt<12>(LocalOffset)); + int64_t CombinedOffset = (uint64_t)LocalOffset + (uint64_t)I->second; + if (!isInt<12>(CombinedOffset)) + return false; + + FoldableInstrs[&User] = CombinedOffset; + continue; + } + } + + // If we reach here we should have an accumulated offset. + assert(Offset.hasValue() && "Expected an offset"); + + // If the offset is new or changed, add the destination register to the + // work list.
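+ // Re-visiting a register whose recorded contribution changed re-propagates + // the new value through its users, so the walk converges even when a + // register is reachable along multiple paths (e.g. both operands of an ADD).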
+ int64_t OffsetVal = Offset.getValue(); + auto P = + RegToOffsetMap.try_emplace(User.getOperand(0).getReg(), OffsetVal); + if (P.second) { + Worklist.push(User.getOperand(0).getReg()); + } else if (P.first->second != OffsetVal) { + P.first->second = OffsetVal; + Worklist.push(User.getOperand(0).getReg()); + } + } + } + + return true; +} + +bool RISCVFoldMemOffset::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + // This optimization may increase size by preventing compression. + if (MF.getFunction().hasOptSize()) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + // FIXME: We can support ADDIW from an LUI+ADDIW pair if the result is + // equivalent to LUI+ADDI. + if (MI.getOpcode() != RISCV::ADDI) + continue; + + // We only want to optimize register ADDIs. + if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) + continue; + + // Ignore 'li'. + if (MI.getOperand(1).getReg() == RISCV::X0) + continue; + + int64_t Offset = MI.getOperand(2).getImm(); + assert(isInt<12>(Offset)); + + DenseMap<MachineInstr *, int64_t> FoldableInstrs; + + if (!foldOffset(MI.getOperand(0).getReg(), Offset, MRI, FoldableInstrs)) + continue; + + if (FoldableInstrs.empty()) + continue; + + // We can fold this ADDI. + // Rewrite all the instructions. + for (auto [MemMI, NewOffset] : FoldableInstrs) + MemMI->getOperand(2).setImm(NewOffset); + + MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); + MI.eraseFromParent(); + MadeChange = true; + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 98c25bc93a8a2..0f5e7bd254f68 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1549,9 +1549,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.useRVVForFixedLengthVectors()) setTargetDAGCombine(ISD::BITCAST); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - // Disable strict node mutation.
IsStrictFPEnabled = true; EnableExtLdPromotion = true; @@ -19462,6 +19459,11 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.intersectWith(Known2); break; } + case RISCVISD::VCPOP_VL: { + KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); + Known.Zero.setBitsFrom(Known2.countMaxActiveBits()); + break; + } case RISCVISD::CZERO_EQZ: case RISCVISD::CZERO_NEZ: Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 167dbb53c5950..89e017807363b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -133,6 +133,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVPostRAExpandPseudoPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVOptWInstrsPass(*PR); + initializeRISCVFoldMemOffsetPass(*PR); initializeRISCVPreRAExpandPseudoPass(*PR); initializeRISCVExpandPseudoPass(*PR); initializeRISCVVectorPeepholePass(*PR); @@ -590,6 +591,7 @@ void RISCVPassConfig::addMachineSSAOptimization() { addPass(createRISCVVectorPeepholePass()); // TODO: Move this to pre regalloc addPass(createRISCVVMV0EliminationPass()); + addPass(createRISCVFoldMemOffsetPass()); TargetPassConfig::addMachineSSAOptimization(); diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index e5a98598370ec..66b989a84b1ce 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -1092,6 +1092,10 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VFWNMSAC_VF: case RISCV::VFWMACCBF16_VV: case RISCV::VFWMACCBF16_VF: + // Vector Floating-Point Square-Root Instruction + case RISCV::VFSQRT_V: + // Vector Floating-Point Reciprocal Square-Root Estimate Instruction + case RISCV::VFRSQRT7_V: // Vector Floating-Point MIN/MAX Instructions case RISCV::VFMIN_VF: case RISCV::VFMIN_VV: diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 3e9fc31d7bfc2..62854ea896179 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -525,7 +525,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToIntPairReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_IntReg); unsigned regIdx = 32; if (Reg >= Sparc::G0 && Reg <= Sparc::G7) @@ -544,7 +544,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToDoubleReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_FloatReg); unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) @@ -555,7 +555,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToQuadReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = 0; switch (Op.Reg.Kind) { default: llvm_unreachable("Unexpected register kind!"); @@ -578,7 +578,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToCoprocPairReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_CoprocReg); unsigned regIdx = 32; if (Reg >= Sparc::C0 && Reg <= Sparc::C31) @@ -592,7 +592,7 @@ class SparcOperand : public MCParsedAsmOperand { static std::unique_ptr 
MorphToMEMrr(unsigned Base, std::unique_ptr Op) { - unsigned offsetReg = Op->getReg(); + MCRegister offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; Op->Mem.OffsetReg = offsetReg; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp index 37503f4bc2ae2..f2a61c95fefb5 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp @@ -66,12 +66,12 @@ bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, return false; if (!MI->getOperand(0).isReg()) return false; - switch (MI->getOperand(0).getReg()) { + switch (MI->getOperand(0).getReg().id()) { default: return false; case SP::G0: // jmp $addr | ret | retl if (MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 8) { - switch(MI->getOperand(1).getReg()) { + switch (MI->getOperand(1).getReg().id()) { default: break; case SP::I7: O << "\tret"; return true; case SP::O7: O << "\tretl"; return true; @@ -115,7 +115,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, const MCOperand &MO = MI->getOperand (opNum); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); if (isV9(STI)) printRegName(O, Reg, SP::RegNamesStateReg); else diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4fc79b3d6e3f8..da4ef677440fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -377,11 +377,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setMaxAtomicSizeInBitsSupported(64); - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is - // consistent with the f64 and f128 names. - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - // Define the emscripten name for return address helper. // TODO: when implementing other Wasm backends, make this generic or only do // this on emscripten depending on what they end up doing. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index b20a06b238c88..1fe0b1f2e0591 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -537,10 +537,6 @@ struct StaticLibcallNameMap { Map[NameLibcall.first] = NameLibcall.second; } } - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is - // consistent with the f64 and f128 names. - Map["__extendhfsf2"] = RTLIB::FPEXT_F16_F32; - Map["__truncsfhf2"] = RTLIB::FPROUND_F32_F16; Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS; } diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 72b103b0bb0c5..cf164acba9ec0 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -267,19 +267,19 @@ def RetCC_X86Common : CallingConv<[ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. 
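// Note: the bf16 vector types added throughout this file are classified // exactly like the equally sized f16 vectors, so they simply join the // existing XMM/YMM/ZMM register and stack-slot assignments below.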
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX target feature. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // Long double types are always returned in FP0 (even with SSE), @@ -565,7 +565,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -574,13 +574,13 @@ def CC_X86_64_C : CallingConv<[ // FIXME: This isn't precisely correct; the x86-64 ABI document says that // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. - CCIfNotVarArg>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>>, @@ -593,14 +593,14 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -631,13 +631,13 @@ def CC_X86_Win64_C : CallingConv<[ CCIfCFGuardTarget>, // 128 bit vectors are passed by pointer - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCPassIndirect>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCPassIndirect>, // 256 bit vectors are passed by pointer - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCPassIndirect>, + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCPassIndirect>, // 512 bit vectors are passed by pointer - CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64], CCPassIndirect>, + CCIfType<[v64i8, v32i16, v16i32, v32f16, v32bf16, v16f32, v8f64, v8i64], CCPassIndirect>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, @@ -734,15 +734,15 @@ def CC_X86_64_AnyReg : CallingConv<[ /// values are spilled on the stack. 
def CC_X86_32_Vector_Common : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -750,15 +750,15 @@ def CC_X86_32_Vector_Common : CallingConv<[ /// values are spilled on the stack. def CC_X86_Win32_Vector : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 4-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 4>>, // 256-bit AVX vectors get 32-byte stack slots that are 4-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 4>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 4-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 4>> ]>; @@ -766,16 +766,16 @@ def CC_X86_Win32_Vector : CallingConv<[ // vector registers def CC_X86_32_Vector_Standard : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCIfIsVarArgOnWin>, @@ -786,16 +786,16 @@ def CC_X86_32_Vector_Standard : CallingConv<[ // vector registers. def CC_X86_32_Vector_Darwin : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 696bb14292dd0..d805a76754c71 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -736,9 +736,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); @@ -6113,6 +6110,19 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Ops.push_back(N1); return true; } + case ISD::CONCAT_VECTORS: { + // Limit this to vXi64 vector cases to make the most of cross lane shuffles. 
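+ // Each concatenated operand becomes a separate shuffle input: operand I + // contributes the identity run [I * NumElts, I * NumElts + NumSubElts), + // since every input is numbered in its own NumElts-wide index space.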
+ unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements(); + if (NumBitsPerElt == 64) { + for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) { + for (unsigned M = 0; M != NumSubElts; ++M) + Mask.push_back((I * NumElts) + M); + Ops.push_back(N.getOperand(I)); + } + return true; + } + return false; + } case ISD::INSERT_SUBVECTOR: { SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); @@ -38927,13 +38937,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef Mask, } // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction. - // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). - if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || - (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { + if (AllowIntDomain && + ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || + (MaskVT.is256BitVector() && Subtarget.hasInt256()) || + (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) { unsigned MaxScale = 64 / MaskEltSize; bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize && DAG.ComputeNumSignBits(V1) == MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { + // Skip 512-bit VPMOV?XBW on non-AVX512BW targets. + if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs()) + continue; bool MatchAny = true; bool MatchZero = true; bool MatchSign = UseSign; @@ -39566,7 +39580,7 @@ static bool matchBinaryPermuteShuffle( static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, bool AllowVariableCrossLaneMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget); @@ -39581,7 +39595,7 @@ static SDValue combineX86ShuffleChainWithExtract( /// instruction but should only be used to replace chains over a certain depth. static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, @@ -40050,6 +40064,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (Depth < 1) return SDValue(); + bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) { + return isTargetShuffleVariableMask(N->getOpcode()); + }); + // Depth threshold above which we can efficiently use variable mask shuffles. int VariableCrossLaneShuffleDepth = Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2; @@ -40120,9 +40138,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( - Inputs, Root, BaseMask, Depth, HasVariableMask, - AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, - Subtarget)) + Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask, + AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input lane-crossing shuffle then lower to VPERMV3, @@ -40293,8 +40310,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. 
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( - Inputs, Root, BaseMask, Depth, HasVariableMask, - AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) + Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask, + AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input shuffle then lower to VPERMV3, @@ -40332,7 +40349,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // extract_subvector(shuffle(x,y,m2),0) static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, bool AllowVariableCrossLaneMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumMaskElts = BaseMask.size(); @@ -40461,7 +40478,7 @@ static SDValue combineX86ShuffleChainWithExtract( if (SDValue WideShuffle = combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth, - HasVariableMask, AllowVariableCrossLaneMask, + SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) { WideShuffle = extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits); @@ -40684,7 +40701,7 @@ static SDValue canonicalizeShuffleMaskWithHorizOp( // TODO: Extend this to merge multiple constant Ops and update the mask. static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops, ArrayRef Mask, - bool HasVariableMask, + ArrayRef SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget) { unsigned SizeInBits = VT.getSizeInBits(); @@ -40706,6 +40723,9 @@ static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops, // only used once or the combined shuffle has included a variable mask // shuffle, this is to avoid constant pool bloat. bool IsOptimizingSize = DAG.shouldOptForSize(); + bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) { + return isTargetShuffleVariableMask(N->getOpcode()); + }); if (IsOptimizingSize && !HasVariableMask && llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); })) return SDValue(); @@ -40807,7 +40827,7 @@ namespace llvm { static SDValue combineX86ShufflesRecursively( ArrayRef SrcOps, int SrcOpIndex, SDValue Root, ArrayRef RootMask, ArrayRef SrcNodes, unsigned Depth, - unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, + unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(!RootMask.empty() && @@ -40863,7 +40883,6 @@ static SDValue combineX86ShufflesRecursively( SmallVector OpMask; SmallVector OpInputs; APInt OpUndef, OpZero; - bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, OpZero, DAG, Depth, false)) { // Shuffle inputs must not be larger than the shuffle result. @@ -41078,7 +41097,6 @@ static SDValue combineX86ShufflesRecursively( return getOnesVector(RootVT, DAG, DL); assert(!Ops.empty() && "Shuffle with no inputs detected"); - HasVariableMask |= IsOpVariableMask; // Update the list of shuffle nodes that have been combined so far. 
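// CombinedNodes now also subsumes the removed HasVariableMask flag: whether a // variable-mask shuffle participated is recomputed on demand via llvm::any_of // of isTargetShuffleVariableMask over the recorded source nodes.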
SmallVector CombinedNodes(SrcNodes); @@ -41107,15 +41125,14 @@ static SDValue combineX86ShufflesRecursively( } if (SDValue Res = combineX86ShufflesRecursively( Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth, - HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG, - Subtarget)) + AllowCrossLaneVar, AllowPerLaneVar, DAG, Subtarget)) return Res; } } // Attempt to constant fold all of the constant source ops. if (SDValue Cst = combineX86ShufflesConstants( - RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget)) + RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget)) return Cst; // If constant fold failed and we only have constants - then we have @@ -41217,7 +41234,7 @@ static SDValue combineX86ShufflesRecursively( // Try to combine into a single shuffle instruction. if (SDValue Shuffle = combineX86ShuffleChain( - Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask, + Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) return Shuffle; @@ -41236,7 +41253,7 @@ static SDValue combineX86ShufflesRecursively( // If that failed and any input is extracted then try to combine as a // shuffle with the larger type. return combineX86ShuffleChainWithExtract( - Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask, + Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget); } @@ -41245,7 +41262,6 @@ static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { return combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG, Subtarget); } @@ -41883,7 +41899,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, + /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -42222,7 +42238,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask); if (SDValue NewMask = combineX86ShufflesConstants( ShufVT, {MaskLHS, MaskRHS}, ByteMask, - /*HasVariableMask=*/true, DAG, DL, Subtarget)) { + {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) { SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, LHS.getOperand(0), NewMask); SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, @@ -43857,7 +43873,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue NewShuffle = combineX86ShufflesRecursively( {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG, Subtarget); if (NewShuffle) @@ -51416,7 +51431,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true, + /*AllowVarCrossLaneMask*/ true, /*AllowVarPerLaneMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle, N0.getOperand(1)); diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp 
b/llvm/lib/Target/X86/X86PadShortFunction.cpp index c43fd97a055fc..2859195c6c26e 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -163,7 +163,8 @@ void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { return; if (hasReturn) { - ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles); + unsigned int &NumCycles = ReturnBBs[MBB]; + NumCycles = std::max(NumCycles, Cycles); return; } diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 0a605dfd017cb..8731a16b88a5c 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -104,8 +104,6 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, {{"gfx90a"}, {"gfx90a"}, GK_GFX90A, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx90c"}, {"gfx90c"}, GK_GFX90C, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, - {{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, - {{"gfx941"}, {"gfx941"}, GK_GFX941, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx942"}, {"gfx942"}, GK_GFX942, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx950"}, {"gfx950"}, GK_GFX950, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, @@ -260,8 +258,6 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX909: return {9, 0, 9}; case GK_GFX90A: return {9, 0, 10}; case GK_GFX90C: return {9, 0, 12}; - case GK_GFX940: return {9, 4, 0}; - case GK_GFX941: return {9, 4, 1}; case GK_GFX942: return {9, 4, 2}; case GK_GFX950: return {9, 5, 0}; case GK_GFX1010: return {10, 1, 0}; @@ -506,8 +502,6 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx950-insts"] = true; [[fallthrough]]; case GK_GFX942: - case GK_GFX941: - case GK_GFX940: Features["fp8-insts"] = true; Features["fp8-conversion-insts"] = true; if (Kind != GK_GFX950) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index c1dd8bc393f33..17e7fada10827 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3970,17 +3970,18 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // TODO: We should track the capturing uses in AANoCapture but the problem // is CGSCC runs. For those we would need to "allow" AANoCapture for // a value in the module slice. - // TODO(captures): Make this more precise. 
- UseCaptureInfo CI = - DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); - if (capturesNothing(CI)) + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: return true; - if (CI.isPassthrough()) { + case UseCaptureKind::MAY_CAPTURE: + LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI + << "\n"); + return false; + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI << "\n"); - return false; + llvm_unreachable("unknown UseCaptureKind"); }; bool IsKnownNoCapture; @@ -6018,16 +6019,16 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { }; auto UseCheck = [&](const Use &U, bool &Follow) -> bool { - // TODO(captures): Make this more precise. - UseCaptureInfo CI = - DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); - if (capturesNothing(CI)) + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: return true; - if (CI.isPassthrough()) { + case UseCaptureKind::MAY_CAPTURE: + return checkUse(A, T, U, Follow); + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - return checkUse(A, T, U, Follow); + llvm_unreachable("Unexpected use capture kind!"); }; if (!A.checkForAllUses(UseCheck, *this, *V)) @@ -12150,13 +12151,16 @@ struct AAGlobalValueInfoFloating : public AAGlobalValueInfo { auto UsePred = [&](const Use &U, bool &Follow) -> bool { Uses.insert(&U); - // TODO(captures): Make this more precise. - UseCaptureInfo CI = DetermineUseCaptureKind(U, /*Base=*/nullptr, nullptr); - if (CI.isPassthrough()) { + switch (DetermineUseCaptureKind(U, nullptr)) { + case UseCaptureKind::NO_CAPTURE: + return checkUse(A, U, Follow, Worklist); + case UseCaptureKind::MAY_CAPTURE: + return checkUse(A, U, Follow, Worklist); + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - return checkUse(A, U, Follow, Worklist); + return true; }; auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) { Uses.insert(&OldU); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 02b0fcb3981a7..a63e38a7d98ad 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -71,9 +71,7 @@ using namespace llvm; #define DEBUG_TYPE "function-attrs" STATISTIC(NumMemoryAttr, "Number of functions with improved memory attribute"); -STATISTIC(NumCapturesNone, "Number of arguments marked captures(none)"); -STATISTIC(NumCapturesPartial, "Number of arguments marked with captures " - "attribute other than captures(none)"); +STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReturned, "Number of arguments marked returned"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); @@ -110,13 +108,6 @@ static cl::opt DisableThinLTOPropagation( "disable-thinlto-funcattrs", cl::init(true), cl::Hidden, cl::desc("Don't propagate function-attrs in thinLTO")); -static void addCapturesStat(CaptureInfo CI) { - if (capturesNothing(CI)) - ++NumCapturesNone; - else - ++NumCapturesPartial; -} - namespace { using SCCNodeSet = SmallSetVector; @@ -507,9 +498,6 @@ namespace { /// SCC of the arguments. struct ArgumentGraphNode { Argument *Definition; - /// CaptureComponents for this argument, excluding captures via Uses. - /// We don't distinguish between other/return captures here. 
- CaptureComponents CC = CaptureComponents::None; SmallVector Uses; }; @@ -551,36 +539,18 @@ class ArgumentGraph { struct ArgumentUsesTracker : public CaptureTracker { ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {} - void tooManyUses() override { CI = CaptureInfo::all(); } - - Action captured(const Use *U, UseCaptureInfo UseCI) override { - if (updateCaptureInfo(U, UseCI.UseCC)) { - // Don't bother continuing if we already capture everything. - if (capturesAll(CI.getOtherComponents())) - return Stop; - return Continue; - } - - // For SCC argument tracking, we're not going to analyze other/ret - // components separately, so don't follow the return value. - return ContinueIgnoringReturn; - } + void tooManyUses() override { Captured = true; } - bool updateCaptureInfo(const Use *U, CaptureComponents CC) { + bool captured(const Use *U) override { CallBase *CB = dyn_cast(U->getUser()); if (!CB) { - if (isa(U->getUser())) - CI |= CaptureInfo::retOnly(CC); - else - // Conservatively assume that the captured value might make its way - // into the return value as well. This could be made more precise. - CI |= CaptureInfo(CC); + Captured = true; return true; } Function *F = CB->getCalledFunction(); if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) { - CI |= CaptureInfo(CC); + Captured = true; return true; } @@ -594,24 +564,22 @@ struct ArgumentUsesTracker : public CaptureTracker { // use. In this case it does not matter if the callee is within our SCC // or not -- we've been captured in some unknown way, and we have to be // conservative. - CI |= CaptureInfo(CC); + Captured = true; return true; } if (UseIndex >= F->arg_size()) { assert(F->isVarArg() && "More params than args in non-varargs call"); - CI |= CaptureInfo(CC); + Captured = true; return true; } - // TODO(captures): Could improve precision by remembering maximum - // capture components for the argument. Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); return false; } - // Does not include potential captures via Uses in the SCC. - CaptureInfo CI = CaptureInfo::none(); + // True only if certainly captured (used outside our SCC). + bool Captured = false; // Uses within our SCC. SmallVector Uses; @@ -1226,15 +1194,6 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, bool SkipInitializes) { ArgumentGraph AG; - auto DetermineAccessAttrsForSingleton = [](Argument *A) { - SmallPtrSet Self; - Self.insert(A); - Attribute::AttrKind R = determinePointerAccessAttrs(A, Self); - if (R != Attribute::None) - return addAccessAttr(A, R); - return false; - }; - // Check each function in turn, determining which pointer arguments are not // captured. for (Function *F : SCCNodes) { @@ -1255,7 +1214,7 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (A.getType()->isPointerTy() && !A.hasNoCaptureAttr()) { A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), CaptureInfo::none())); - ++NumCapturesNone; + ++NumNoCapture; Changed.insert(F); } } @@ -1266,23 +1225,21 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (!A.getType()->isPointerTy()) continue; bool HasNonLocalUses = false; - CaptureInfo OrigCI = A.getAttributes().getCaptureInfo(); - if (!capturesNothing(OrigCI)) { + if (!A.hasNoCaptureAttr()) { ArgumentUsesTracker Tracker(SCCNodes); PointerMayBeCaptured(&A, &Tracker); - CaptureInfo NewCI = Tracker.CI & OrigCI; - if (NewCI != OrigCI) { + if (!Tracker.Captured) { if (Tracker.Uses.empty()) { - // If the information is complete, add the attribute now. 
- A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), NewCI)); - addCapturesStat(NewCI); + // If it's trivially not captured, mark it nocapture now. + A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), + CaptureInfo::none())); + ++NumNoCapture; Changed.insert(F); } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. ArgumentGraphNode *Node = AG[&A]; - Node->CC = CaptureComponents(NewCI); for (Argument *Use : Tracker.Uses) { Node->Uses.push_back(AG[Use]); if (Use != &A) @@ -1297,8 +1254,12 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // an SCC? Note that we don't allow any calls at all here, or else our // result will be dependent on the iteration order through the // functions in the SCC. - if (DetermineAccessAttrsForSingleton(&A)) - Changed.insert(F); + SmallPtrSet Self; + Self.insert(&A); + Attribute::AttrKind R = determinePointerAccessAttrs(&A, Self); + if (R != Attribute::None) + if (addAccessAttr(&A, R)) + Changed.insert(F); } if (!SkipInitializes && !A.onlyReadsMemory()) { if (inferInitializes(A, *F)) @@ -1324,17 +1285,17 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (ArgumentSCC[0]->Uses.size() == 1 && ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { Argument *A = ArgumentSCC[0]->Definition; - CaptureInfo OrigCI = A->getAttributes().getCaptureInfo(); - CaptureInfo NewCI = CaptureInfo(ArgumentSCC[0]->CC) & OrigCI; - if (NewCI != OrigCI) { - A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI)); - addCapturesStat(NewCI); - Changed.insert(A->getParent()); - } - - // Infer the access attributes given the new captures one - if (DetermineAccessAttrsForSingleton(A)) - Changed.insert(A->getParent()); + A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), + CaptureInfo::none())); + ++NumNoCapture; + Changed.insert(A->getParent()); + + // Infer the access attributes given the new nocapture one + SmallPtrSet Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); + if (R != Attribute::None) + addAccessAttr(A, R); } continue; } @@ -1346,45 +1307,27 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, ArgumentSCCNodes.insert(I->Definition); } - // At the SCC level, only track merged CaptureComponents. We're not - // currently prepared to handle propagation of return-only captures across - // the SCC. - CaptureComponents CC = CaptureComponents::None; + bool SCCCaptured = false; for (ArgumentGraphNode *N : ArgumentSCC) { for (ArgumentGraphNode *Use : N->Uses) { Argument *A = Use->Definition; - if (ArgumentSCCNodes.count(A)) - CC |= Use->CC; - else - CC |= CaptureComponents(A->getAttributes().getCaptureInfo()); + if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) + continue; + SCCCaptured = true; break; } - if (capturesAll(CC)) + if (SCCCaptured) break; } - - if (!capturesAll(CC)) { - for (ArgumentGraphNode *N : ArgumentSCC) { - Argument *A = N->Definition; - CaptureInfo OrigCI = A->getAttributes().getCaptureInfo(); - CaptureInfo NewCI = CaptureInfo(N->CC | CC) & OrigCI; - if (NewCI != OrigCI) { - A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI)); - addCapturesStat(NewCI); - Changed.insert(A->getParent()); - } - } - } - - // TODO(captures): Ignore address-only captures. - if (capturesAnything(CC)) { - // As the pointer may be captured, determine the pointer attributes - // looking at each argument invidivually. 
- for (ArgumentGraphNode *N : ArgumentSCC) { - if (DetermineAccessAttrsForSingleton(N->Definition)) - Changed.insert(N->Definition->getParent()); - } + if (SCCCaptured) continue; + + for (ArgumentGraphNode *N : ArgumentSCC) { + Argument *A = N->Definition; + A->addAttr( + Attribute::getWithCaptureInfo(A->getContext(), CaptureInfo::none())); + ++NumNoCapture; + Changed.insert(A->getParent()); } // We also want to compute readonly/readnone/writeonly. With a small number diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 9586fc97a39f7..2d046f09f1b2b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2064,7 +2064,7 @@ static bool destArrayCanBeWidened(CallInst *CI) { return true; } -static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F, +static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, unsigned NumBytesToPad, unsigned NumBytesToCopy) { if (!OldVar->hasInitializer()) @@ -2083,10 +2083,10 @@ static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F, StrData.push_back('\0'); auto Arr = ArrayRef(StrData.data(), NumBytesToCopy + NumBytesToPad); // Create new padded version of global variable. - Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr); + Constant *SourceReplace = ConstantDataArray::get(OldVar->getContext(), Arr); GlobalVariable *NewGV = new GlobalVariable( - *(F->getParent()), SourceReplace->getType(), true, OldVar->getLinkage(), - SourceReplace, SourceReplace->getName()); + *(OldVar->getParent()), SourceReplace->getType(), true, + OldVar->getLinkage(), SourceReplace, SourceReplace->getName()); // Copy any other attributes from original global variable // e.g. unamed_addr NewGV->copyAttributesFrom(OldVar); @@ -2114,13 +2114,13 @@ static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad, } } -static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar, +static bool tryWidenGlobalArrayAndDests(GlobalVariable *SourceVar, const unsigned NumBytesToPad, const unsigned NumBytesToCopy, ConstantInt *BytesToCopyOp, ConstantDataArray *SourceDataArray) { auto *NewSourceGV = - widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy); + widenGlobalVariable(SourceVar, NumBytesToPad, NumBytesToCopy); if (!NewSourceGV) return false; @@ -2158,8 +2158,6 @@ static bool tryWidenGlobalArraysUsedByMemcpy( if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI)) continue; - Function *F = CI->getCalledFunction(); - auto *BytesToCopyOp = dyn_cast(CI->getArgOperand(2)); if (!BytesToCopyOp) continue; @@ -2186,10 +2184,12 @@ static bool tryWidenGlobalArraysUsedByMemcpy( if (NumElementsToCopy != DZSize || DZSize != SZSize) continue; - unsigned NumBytesToPad = GetTTI(*F).getNumBytesToPadGlobalArray( - NumBytesToCopy, SourceDataArray->getType()); + unsigned NumBytesToPad = + GetTTI(*CI->getFunction()) + .getNumBytesToPadGlobalArray(NumBytesToCopy, + SourceDataArray->getType()); if (NumBytesToPad) { - return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy, + return tryWidenGlobalArrayAndDests(GV, NumBytesToPad, NumBytesToCopy, BytesToCopyOp, SourceDataArray); } } diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index d748b162d7809..0982fd35401cb 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -89,6 +89,7 @@ 
STATISTIC(FoundProfiledCalleeMaxDepth, "Maximum depth of profiled callees found via tail calls"); STATISTIC(FoundProfiledCalleeNonUniquelyCount, "Number of profiled callees found via multiple tail call chains"); +STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -127,6 +128,10 @@ static cl::opt<bool> AllowRecursiveCallsites( "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles")); +static cl::opt<bool> CloneRecursiveContexts( + "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of contexts through recursive cycles")); + // When disabled, try to detect and prevent cloning of recursive contexts. // This is only necessary until we support cloning through recursive cycles. // Leave on by default for now, as disabling requires a little bit of compile @@ -134,7 +139,7 @@ static cl::opt<bool> AllowRecursiveCallsites( // hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled. static cl::opt<bool> AllowRecursiveContexts( "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, - cl::desc("Allow cloning of contexts through recursive cycles")); + cl::desc("Allow cloning of contexts having recursive cycles")); namespace llvm { cl::opt<bool> EnableMemProfContextDisambiguation( @@ -293,37 +298,40 @@ class CallsiteContextGraph { // TODO: Should this be a map (from Caller node) for more efficient lookup? std::vector<std::shared_ptr<ContextEdge>> CallerEdges; - // Get the list of edges from which we can compute allocation information - // such as the context ids and allocation type of this node. - const std::vector<std::shared_ptr<ContextEdge>> * - getEdgesWithAllocInfo() const { - // If node has any callees, compute from those, otherwise compute from - // callers (i.e. if this is the leaf allocation node). - if (!CalleeEdges.empty()) - return &CalleeEdges; + // Returns true if we need to look at the caller edges for determining the + // node context ids and allocation type. + bool useCallerEdgesForContextInfo() const { // Typically if the callee edges are empty either the caller edges are // also empty, or this is an allocation (leaf node). However, if we are // allowing recursive callsites and contexts this will be violated for // incompletely cloned recursive cycles. - assert(CallerEdges.empty() || IsAllocation || + assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation || (AllowRecursiveCallsites && AllowRecursiveContexts)); - if (!CallerEdges.empty() && IsAllocation) - return &CallerEdges; - return nullptr; + // When cloning for a recursive context, during cloning we might be in the + // midst of cloning for a recurrence and have moved context ids off of a + // caller edge onto the clone but not yet off of the incoming caller + // (back) edge. If we don't look at those we miss the fact that this node + // still has context ids of interest. + return IsAllocation || CloneRecursiveContexts; } // Compute the context ids for this node from the union of its edge context // ids. DenseSet<uint32_t> getContextIds() const { - DenseSet<uint32_t> ContextIds; - auto *Edges = getEdgesWithAllocInfo(); - if (!Edges) - return {}; unsigned Count = 0; - for (auto &Edge : *Edges) + // Compute the number of ids for reserve below.
In general we only need to + // look at one set of edges, typically the callee edges, since other than + // allocations and in some cases during recursion cloning, all the context + // ids on the callers should also flow out via callee edges. + for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges) Count += Edge->getContextIds().size(); + DenseSet ContextIds; ContextIds.reserve(Count); - for (auto &Edge : *Edges) + auto Edges = llvm::concat>( + CalleeEdges, useCallerEdgesForContextInfo() + ? CallerEdges + : std::vector>()); + for (const auto &Edge : Edges) ContextIds.insert(Edge->getContextIds().begin(), Edge->getContextIds().end()); return ContextIds; @@ -332,13 +340,14 @@ class CallsiteContextGraph { // Compute the allocation type for this node from the OR of its edge // allocation types. uint8_t computeAllocType() const { - auto *Edges = getEdgesWithAllocInfo(); - if (!Edges) - return (uint8_t)AllocationType::None; uint8_t BothTypes = (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; uint8_t AllocType = (uint8_t)AllocationType::None; - for (auto &Edge : *Edges) { + auto Edges = llvm::concat>( + CalleeEdges, useCallerEdgesForContextInfo() + ? CallerEdges + : std::vector>()); + for (const auto &Edge : Edges) { AllocType |= Edge->AllocTypes; // Bail early if alloc type reached both, no further refinement. if (AllocType == BothTypes) @@ -350,10 +359,11 @@ class CallsiteContextGraph { // The context ids set for this node is empty if its edge context ids are // also all empty. bool emptyContextIds() const { - auto *Edges = getEdgesWithAllocInfo(); - if (!Edges) - return true; - for (auto &Edge : *Edges) { + auto Edges = llvm::concat>( + CalleeEdges, useCallerEdgesForContextInfo() + ? CallerEdges + : std::vector>()); + for (const auto &Edge : Edges) { if (!Edge->getContextIds().empty()) return false; } @@ -434,6 +444,14 @@ class CallsiteContextGraph { // for contexts including this edge. uint8_t AllocTypes = 0; + // Set just before initiating cloning when cloning of recursive contexts is + // enabled. Used to defer cloning of backedges until we have done cloning of + // the callee node for non-backedge caller edges. This exposes cloning + // opportunities through the backedge of the cycle. + // TODO: Note that this is not updated during cloning, and it is unclear + // whether that would be needed. + bool IsBackedge = false; + // The set of IDs for contexts including this edge. DenseSet ContextIds; @@ -722,6 +740,9 @@ class CallsiteContextGraph { void moveCalleeEdgeToNewCaller(const std::shared_ptr &Edge, ContextNode *NewCaller); + void markBackedges(ContextNode *Node, DenseSet &Visited, + DenseSet &CurrentStack); + /// Recursively perform cloning on the graph for the given Node and its /// callers, in order to uniquely identify the allocation behavior of an /// allocation given its context. The context ids of the allocation being @@ -2874,6 +2895,7 @@ template void CallsiteContextGraph::ContextEdge::print( raw_ostream &OS) const { OS << "Edge from Callee " << Callee << " to Caller: " << Caller + << (IsBackedge ? " (BE)" : "") << " AllocTypes: " << getAllocTypeString(AllocTypes); OS << " ContextIds:"; std::vector SortedIds(ContextIds.begin(), ContextIds.end()); @@ -3115,6 +3137,8 @@ void CallsiteContextGraph:: // node (Edge's current callee may be the original node too). 
@@ -3115,6 +3137,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // node (Edge's current callee may be the original node too).
   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
 
+  bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
+
   ContextNode *OldCallee = Edge->Callee;
 
   // We might already have an edge to the new callee from earlier cloning for a
@@ -3181,8 +3205,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // If this is a direct recursion edge, use NewCallee (the clone) as the
     // callee as well, so that any edge updated/created here is also direct
     // recursive.
-    if (CalleeToUse == OldCallee)
+    if (CalleeToUse == OldCallee) {
+      // If this is a recursive edge, see if we already moved a recursive edge
+      // (which would have to have been this one) - if we were only moving a
+      // subset of context ids it would still be on OldCallee.
+      if (EdgeIsRecursive) {
+        assert(OldCalleeEdge == Edge);
+        continue;
+      }
       CalleeToUse = NewCallee;
+    }
     // The context ids moving to the new callee are the subset of this edge's
     // context ids and the context ids on the caller edge being moved.
     DenseSet<uint32_t> EdgeContextIdsToMove =
@@ -3369,9 +3401,47 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   }
 }
 
+// This is the standard DFS based backedge discovery algorithm.
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
+    ContextNode *Node, DenseSet<const ContextNode *> &Visited,
+    DenseSet<const ContextNode *> &CurrentStack) {
+  auto I = Visited.insert(Node);
+  // We should only call this for unvisited nodes.
+  assert(I.second);
+  for (auto &CalleeEdge : Node->CalleeEdges) {
+    auto *Callee = CalleeEdge->Callee;
+    if (Visited.count(Callee)) {
+      // Since this was already visited we need to check if it is currently on
+      // the recursive stack in which case it is a backedge.
+      if (CurrentStack.count(Callee))
+        CalleeEdge->IsBackedge = true;
+      continue;
+    }
+    CurrentStack.insert(Callee);
+    markBackedges(Callee, Visited, CurrentStack);
+    CurrentStack.erase(Callee);
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
+  // If we are cloning recursive contexts, find and mark backedges from all root
+  // callers, using the typical DFS based backedge analysis.
   DenseSet<const ContextNode *> Visited;
+  if (CloneRecursiveContexts) {
+    DenseSet<const ContextNode *> CurrentStack;
+    for (auto &Entry : NonAllocationCallToContextNodeMap) {
+      auto *Node = Entry.second;
+      if (Node->isRemoved())
+        continue;
+      // It is a root if it doesn't have callers.
+      if (!Node->CallerEdges.empty())
+        continue;
+      markBackedges(Node, Visited, CurrentStack);
+      assert(CurrentStack.empty());
+    }
+  }
   for (auto &Entry : AllocationCallToContextNodeMap) {
     Visited.clear();
     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
@@ -3430,6 +3500,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
       assert(!is_contained(Node->CallerEdges, Edge));
       continue;
     }
+    // Defer backedges. See comments further below where these edges are
+    // handled during the cloning of this Node.
+    if (Edge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      continue;
+    }
     // Ignore any caller we previously visited via another edge.
     if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
       identifyClones(Edge->Caller, Visited, AllocContextIds);
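Aside: markBackedges above is the textbook DFS back-edge test: an edge whose target is still on the active DFS stack closes a cycle. A self-contained sketch of the same algorithm on a plain adjacency-list graph, with invented names; this variant marks the node itself on entry rather than each callee around the recursive call, which is equivalent when roots have no incoming edges:

#include <cassert>
#include <utility>
#include <vector>

using Graph = std::vector<std::vector<int>>;

void markBackedges(const Graph &G, int Node, std::vector<bool> &Visited,
                   std::vector<bool> &OnStack,
                   std::vector<std::pair<int, int>> &BackEdges) {
  Visited[Node] = true;
  OnStack[Node] = true;
  for (int Succ : G[Node]) {
    if (Visited[Succ]) {
      if (OnStack[Succ]) // Succ is a DFS ancestor: Node -> Succ is a back edge.
        BackEdges.push_back({Node, Succ});
      continue;
    }
    markBackedges(G, Succ, Visited, OnStack, BackEdges);
  }
  OnStack[Node] = false;
}

int main() {
  Graph G = {{1}, {2}, {0, 1}}; // Edges 2 -> 0 and 2 -> 1 both close cycles.
  std::vector<bool> Visited(3), OnStack(3);
  std::vector<std::pair<int, int>> BackEdges;
  markBackedges(G, 0, Visited, OnStack, BackEdges);
  assert(BackEdges.size() == 2);
}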
@@ -3483,6 +3561,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
   DenseSet<uint32_t> RecursiveContextIds;
+  assert(AllowRecursiveContexts || !CloneRecursiveContexts);
   // If we are allowing recursive callsites, but have also disabled recursive
   // contexts, look for context ids that show up in multiple caller edges.
   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
@@ -3505,6 +3584,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   // makes it less error-prone.
   auto CallerEdges = Node->CallerEdges;
   for (auto &CallerEdge : CallerEdges) {
+    // Skip any that have been removed by an earlier recursive call.
+    if (CallerEdge->isRemoved()) {
+      assert(!is_contained(Node->CallerEdges, CallerEdge));
+      continue;
+    }
+    assert(CallerEdge->Callee == Node);
+
     // See if cloning the prior caller edge left this node with a single alloc
     // type or a single caller. In that case no more cloning of Node is needed.
     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
@@ -3546,13 +3632,100 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     //
     // Then check if by cloning node at least one of the callee edges will be
     // disambiguated by splitting out different context ids.
+    //
+    // However, always do the cloning if this is a backedge, in which case we
+    // have not yet cloned along this caller edge.
     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
-    if (allocTypeToUse(CallerAllocTypeForAlloc) ==
+    if (!CallerEdge->IsBackedge &&
+        allocTypeToUse(CallerAllocTypeForAlloc) ==
             allocTypeToUse(Node->AllocTypes) &&
         allocTypesMatch(
-            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
+            CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
       continue;
+    }
+
+    if (CallerEdge->IsBackedge) {
+      // We should only mark these if cloning recursive contexts, where we
+      // need to do this deferral.
+      assert(CloneRecursiveContexts);
+      DeferredBackedges++;
+    }
+
+    // If this is a backedge, we now do recursive cloning starting from its
+    // caller since we may have moved unambiguous caller contexts to a clone
+    // of this Node in a previous iteration of the current loop, giving more
+    // opportunity for cloning through the backedge. Because we sorted the
+    // caller edges earlier so that cold caller edges are first, we would have
+    // visited and cloned this node for any unambiguously cold non-recursive
+    // callers before any ambiguous backedge callers. Note that we don't do this
+    // if the caller is already cloned or visited during cloning (e.g. via a
+    // different context path from the allocation).
+    // TODO: Can we do better in the case where the caller was already visited?
+    if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
+        !Visited.count(CallerEdge->Caller)) {
+      const auto OrigIdCount = CallerEdge->getContextIds().size();
+      // Now do the recursive cloning of this backedge's caller, which was
+      // deferred earlier.
+      identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
+      removeNoneTypeCalleeEdges(CallerEdge->Caller);
+      // See if the recursive call to identifyClones moved the context ids to a
+      // new edge from this node to a clone of caller, and switch to looking at
+      // that new edge so that we clone Node for the new caller clone.
+      bool UpdatedEdge = false;
+      if (OrigIdCount > CallerEdge->getContextIds().size()) {
+        for (auto E : Node->CallerEdges) {
+          // Only interested in clones of the current edge's caller.
+          if (E->Caller->CloneOf != CallerEdge->Caller)
+            continue;
+          // See if this edge contains any of the context ids originally on the
+          // current caller edge.
+          auto CallerEdgeContextsForAllocNew =
+              set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
+          if (CallerEdgeContextsForAllocNew.empty())
+            continue;
+          // Make sure we don't pick a previously existing caller edge of this
+          // Node, which would be processed on a different iteration of the
+          // outer loop over the saved CallerEdges.
+          if (std::find(CallerEdges.begin(), CallerEdges.end(), E) !=
+              CallerEdges.end())
+            continue;
+          // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
+          // are updated further below for all cases where we just invoked
+          // identifyClones recursively.
+          CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
+          CallerEdge = E;
+          UpdatedEdge = true;
+          break;
+        }
+      }
+      // If cloning removed this edge (and we didn't update it to a new edge
+      // above), we're done with this edge. It's possible we moved all of the
+      // context ids to an existing clone, in which case there's no need to do
+      // further processing for them.
+      if (CallerEdge->isRemoved())
+        continue;
+
+      // Now we need to update the information used for the cloning decisions
+      // further below, as we may have modified edges and their context ids.
+
+      // Note if we changed the CallerEdge above we would have already updated
+      // the context ids.
+      if (!UpdatedEdge) {
+        CallerEdgeContextsForAlloc = set_intersection(
+            CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
+        if (CallerEdgeContextsForAlloc.empty())
+          continue;
+      }
+      // Update the other information that depends on the edges and on the now
+      // updated CallerEdgeContextsForAlloc.
+      CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
+      CalleeEdgeAllocTypesForCallerEdge.clear();
+      for (auto &CalleeEdge : Node->CalleeEdges) {
+        CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
+            CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
+      }
+    }
 
     // First see if we can use an existing clone. Check each clone and its
     // callee edges for matching alloc types.
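Aside: the update step above narrows the tracked context ids with llvm::set_intersection, keeping only the ids still present on the (possibly replaced) caller edge. Its semantics, shown on hypothetical values:

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetOperations.h"
#include <cassert>

int main() {
  llvm::DenseSet<unsigned> A{1, 2, 3}, B{2, 3, 4};
  // set_intersection returns a new set of the common elements and leaves
  // both inputs unmodified.
  llvm::DenseSet<unsigned> I = llvm::set_intersection(A, B);
  assert(I.size() == 2 && I.contains(2) && I.contains(3));
}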
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 76020d2b1dbf4..00a8117f32e70 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -882,8 +882,7 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) {
 
     void tooManyUses() override { Captured = true; }
 
-    Action captured(const Use *U, UseCaptureInfo CI) override {
-      // TODO(captures): Use UseCaptureInfo.
+    bool captured(const Use *U) override {
       auto *ICmp = dyn_cast<ICmpInst>(U->getUser());
       // We need to check that U is based *only* on the alloca, and doesn't
       // have other contributions from a select/phi operand.
@@ -893,11 +892,11 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) {
         // Collect equality icmps of the alloca, and don't treat them as
         // captures.
         ICmps[ICmp] |= 1u << U->getOperandNo();
-        return Continue;
+        return false;
       }
 
       Captured = true;
-      return Stop;
+      return true;
     }
   };
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 2e14145aef884..e621a0b7fe596 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -205,11 +205,15 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
   unsigned ValZeros = ValC.logBase2();
   unsigned AndZeros = AndMask.logBase2();
   bool ShouldNotVal = !TC.isZero();
+  bool NeedShift = ValZeros != AndZeros;
+  bool NeedZExtTrunc =
+      SelType->getScalarSizeInBits() != V->getType()->getScalarSizeInBits();
 
-  // If we would need to create an 'and' + 'shift' + 'xor' to replace a 'select'
-  // + 'icmp', then this transformation would result in more instructions and
-  // potentially interfere with other folding.
-  if (CreateAnd && ShouldNotVal && ValZeros != AndZeros)
+  // If we would need to create an 'and' + 'shift' + 'xor' + cast to replace
+  // a 'select' + 'icmp', then this transformation would result in more
+  // instructions and potentially interfere with other folding.
+  if (CreateAnd + ShouldNotVal + NeedShift + NeedZExtTrunc >
+      1 + Cmp->hasOneUse())
     return nullptr;
 
   // Insert the 'and' instruction on the input to the truncate.
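Aside: a scalar model of the budget the new check enforces: the created and/shift/xor/cast instructions must not outnumber the replaced select plus (single-use) icmp. The simplest profitable case needs only the shift (invented names; a C++ stand-in for the IR rewrite, not the pass itself):

#include <cassert>
#include <cstdint>

// select (icmp eq (and %x, 4), 0), i32 0, i32 2 ...
uint32_t selectForm(uint32_t X) { return (X & 4) == 0 ? 0u : 2u; }

// ... becomes lshr (and %x, 4), 1: AndZeros = log2(4) = 2 and
// ValZeros = log2(2) = 1, so one shift right by AndZeros - ValZeros = 1.
uint32_t foldedForm(uint32_t X) { return (X & 4) >> 1; }

int main() {
  for (uint32_t X = 0; X < 16; ++X)
    assert(selectForm(X) == foldedForm(X));
}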
@@ -742,39 +746,47 @@ static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal,
 /// 1. The icmp predicate is inverted
 /// 2. The select operands are reversed
 /// 3. The magnitude of C2 and C1 are flipped
-static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal,
-                                     Value *FalseVal,
-                                     InstCombiner::BuilderTy &Builder) {
+static Value *foldSelectICmpAndBinOp(Value *CondVal, Value *TrueVal,
+                                     Value *FalseVal,
+                                     InstCombiner::BuilderTy &Builder) {
   // Only handle integer compares. Also, if this is a vector select, we need a
   // vector compare.
   if (!TrueVal->getType()->isIntOrIntVectorTy() ||
-      TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy())
+      TrueVal->getType()->isVectorTy() != CondVal->getType()->isVectorTy())
     return nullptr;
 
-  Value *CmpLHS = IC->getOperand(0);
-  Value *CmpRHS = IC->getOperand(1);
-
   unsigned C1Log;
   bool NeedAnd = false;
-  CmpInst::Predicate Pred = IC->getPredicate();
-  if (IC->isEquality()) {
-    if (!match(CmpRHS, m_Zero()))
-      return nullptr;
+  CmpPredicate Pred;
+  Value *CmpLHS, *CmpRHS;
 
-    const APInt *C1;
-    if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
-      return nullptr;
+  if (match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)))) {
+    if (ICmpInst::isEquality(Pred)) {
+      if (!match(CmpRHS, m_Zero()))
+        return nullptr;
 
-    C1Log = C1->logBase2();
-  } else {
-    auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred);
-    if (!Res || !Res->Mask.isPowerOf2())
-      return nullptr;
+      const APInt *C1;
+      if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
+        return nullptr;
 
-    CmpLHS = Res->X;
-    Pred = Res->Pred;
-    C1Log = Res->Mask.logBase2();
-    NeedAnd = true;
+      C1Log = C1->logBase2();
+    } else {
+      auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred);
+      if (!Res || !Res->Mask.isPowerOf2())
+        return nullptr;
+
+      CmpLHS = Res->X;
+      Pred = Res->Pred;
+      C1Log = Res->Mask.logBase2();
+      NeedAnd = true;
+    }
+  } else if (auto *Trunc = dyn_cast<TruncInst>(CondVal)) {
+    CmpLHS = Trunc->getOperand(0);
+    C1Log = 0;
+    Pred = ICmpInst::ICMP_NE;
+    NeedAnd = !Trunc->hasNoUnsignedWrap();
+  } else {
+    return nullptr;
   }
 
   Value *Y, *V = CmpLHS;
@@ -808,7 +820,7 @@ static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal,
 
   // Make sure we don't create more instructions than we save.
   if ((NeedShift + NeedXor + NeedZExtTrunc + NeedAnd) >
-      (IC->hasOneUse() + BinOp->hasOneUse()))
+      (CondVal->hasOneUse() + BinOp->hasOneUse()))
     return nullptr;
 
   if (NeedAnd) {
@@ -829,7 +841,10 @@ static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal,
   if (NeedXor)
     V = Builder.CreateXor(V, *C2);
 
-  return Builder.CreateBinOp(BinOp->getOpcode(), Y, V);
+  auto *Res = Builder.CreateBinOp(BinOp->getOpcode(), Y, V);
+  if (auto *BO = dyn_cast<BinaryOperator>(Res))
+    BO->copyIRFlags(BinOp);
+  return Res;
 }
 
 /// Canonicalize a set or clear of a masked set of constant bits to
@@ -1983,9 +1998,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
   if (Instruction *V = foldSelectZeroOrOnes(ICI, TrueVal, FalseVal, Builder))
     return V;
 
-  if (Value *V = foldSelectICmpAndBinOp(ICI, TrueVal, FalseVal, Builder))
-    return replaceInstUsesWith(SI, V);
-
   if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder))
     return replaceInstUsesWith(SI, V);
 
@@ -3943,6 +3955,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
     if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
       return Result;
 
+  if (Value *V = foldSelectICmpAndBinOp(CondVal, TrueVal, FalseVal, Builder))
+    return replaceInstUsesWith(SI, V);
+
   if (Instruction *Add = foldAddSubSelect(SI, Builder))
     return Add;
   if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder))
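Aside: the signature change lets visitSelectInst feed any condition value in, so a trunc-to-i1 condition (a test of bit 0) now reaches the same bit-math rewrite as the equivalent icmp. A scalar model of the before and after semantics, with invented names:

#include <cassert>
#include <cstdint>

// select (trunc i32 %x to i1), (or %y, 2), %y ...
uint32_t selectForm(uint32_t X, uint32_t Y) { return (X & 1) ? (Y | 2) : Y; }

// ... becomes or %y, (shl (and %x, 1), 1); the 'and' is the NeedAnd mask,
// required unless the trunc carries the nuw flag.
uint32_t foldedForm(uint32_t X, uint32_t Y) { return Y | ((X & 1) << 1); }

int main() {
  for (uint32_t X = 0; X < 4; ++X)
    for (uint32_t Y = 0; Y < 8; ++Y)
      assert(selectForm(X, Y) == foldedForm(X, Y));
}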
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index a1649c276de83..f3b53e05c519e 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -174,7 +174,7 @@ static cl::opt<bool> EnableInitializesImprovement(
 // Helper functions
 //===----------------------------------------------------------------------===//
 using OverlapIntervalsTy = std::map<int64_t, int64_t>;
-using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
+using InstOverlapIntervalsTy = MapVector<Instruction *, OverlapIntervalsTy>;
 
 /// Returns true if the end of this instruction can be safely shortened in
 /// length.
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9a729b7afb8b9..87b27beb01a0a 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1550,33 +1550,32 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
       }
       if (!Visited.insert(&U).second)
         continue;
-      UseCaptureInfo CI =
-          DetermineUseCaptureKind(U, AI, IsDereferenceableOrNull);
-      // TODO(captures): Make this more precise.
-      if (CI.isPassthrough()) {
+      switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) {
+      case UseCaptureKind::MAY_CAPTURE:
+        return false;
+      case UseCaptureKind::PASSTHROUGH:
+        // Instructions cannot have non-instruction users.
         Worklist.push_back(UI);
         continue;
-      }
-
-      if (capturesAnything(CI))
-        return false;
-
-      if (UI->isLifetimeStartOrEnd()) {
-        // We note the locations of these intrinsic calls so that we can
-        // delete them later if the optimization succeeds, this is safe
-        // since both llvm.lifetime.start and llvm.lifetime.end intrinsics
-        // practically fill all the bytes of the alloca with an undefined
-        // value, although conceptually marked as alive/dead.
-        int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
-        if (Size < 0 || Size == DestSize) {
-          LifetimeMarkers.push_back(UI);
-          continue;
+      case UseCaptureKind::NO_CAPTURE: {
+        if (UI->isLifetimeStartOrEnd()) {
+          // We note the locations of these intrinsic calls so that we can
+          // delete them later if the optimization succeeds, this is safe
+          // since both llvm.lifetime.start and llvm.lifetime.end intrinsics
+          // practically fill all the bytes of the alloca with an undefined
+          // value, although conceptually marked as alive/dead.
+          int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
+          if (Size < 0 || Size == DestSize) {
+            LifetimeMarkers.push_back(UI);
+            continue;
+          }
         }
+        if (UI->hasMetadata(LLVMContext::MD_noalias))
+          NoAliasInstrs.insert(UI);
+        if (!ModRefCallback(UI))
+          return false;
+      }
       }
-      if (UI->hasMetadata(LLVMContext::MD_noalias))
-        NoAliasInstrs.insert(UI);
-      if (!ModRefCallback(UI))
-        return false;
     }
   }
   return true;
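Aside: the DeadStoreElimination change above swaps DenseMap for MapVector, presumably so iteration over the per-instruction overlap intervals follows insertion order rather than hash order. The behavioral difference in brief (a runnable sketch, not part of the patch):

#include "llvm/ADT/MapVector.h"
#include <cstdio>

int main() {
  llvm::MapVector<int, const char *> MV;
  MV[30] = "first";
  MV[10] = "second";
  MV[20] = "third";
  // MapVector iterates in insertion order (30, 10, 20); DenseMap iteration
  // order depends on hashing and growth history, which can make downstream
  // behavior nondeterministic.
  for (const auto &KV : MV)
    std::printf("%d %s\n", KV.first, KV.second);
  return 0;
}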
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 22c2f91ff55f6..cf0ba6fa54700 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -366,24 +366,6 @@ void PlainCFGBuilder::buildPlainCFG(
   // latter.
   BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB;
   Loop2Region[LI->getLoopFor(TheLoop->getHeader())] = TheRegion;
-  BasicBlock *ExitBB = TheLoop->getUniqueExitBlock();
-  if (!ExitBB) {
-    // If there is no unique exit block, we must exit via the latch. This exit
-    // is mapped to the middle block in the input plan.
-    BasicBlock *Latch = TheLoop->getLoopLatch();
-    auto *Br = cast<BranchInst>(Latch->getTerminator());
-    if (TheLoop->contains(Br->getSuccessor(0))) {
-      assert(!TheLoop->contains(Br->getSuccessor(1)) &&
-             "latch must exit the loop");
-      ExitBB = Br->getSuccessor(1);
-    } else {
-      assert(!TheLoop->contains(Br->getSuccessor(0)) &&
-             "latch must exit the loop");
-      ExitBB = Br->getSuccessor(0);
-    }
-  }
-  assert(ExitBB && "Must have a unique exit block or also exit via the latch.");
-  BB2VPBB[ExitBB] = cast<VPBasicBlock>(TheRegion->getSingleSuccessor());
 
   // The existing vector region's entry and exiting VPBBs correspond to the loop
   // header and latch.
diff --git a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll
index cef11b94f3873..872715f31011d 100644
--- a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll
+++ b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll
@@ -2,8 +2,9 @@
 
 declare { ptr, i1 } @get_struct()
 declare <2 x ptr> @get_vec()
+declare void @escape(ptr)
 
-; CHECK: MayAlias: i32* %a, i32* %extract
+; CHECK: NoAlias: i32* %a, i32* %extract
 define i32 @test_extractvalue() {
   %a = alloca i32
   %call = call { ptr, i1 } @get_struct()
@@ -13,7 +14,7 @@ define i32 @test_extractvalue() {
   ret i32 %v
 }
 
-; CHECK: MayAlias: i32* %a, i32* %extract
+; CHECK: NoAlias: i32* %a, i32* %extract
 define i32 @test_extractelement() {
   %a = alloca i32
   %call = call <2 x ptr> @get_vec()
@@ -22,3 +23,25 @@ define i32 @test_extractelement() {
   %v = load i32, ptr %a
   ret i32 %v
 }
+
+; CHECK: MayAlias: i32* %a, i32* %extract
+define i32 @test_extractvalue_escape() {
+  %a = alloca i32
+  call void @escape(ptr %a)
+  %call = call { ptr, i1 } @get_struct()
+  %extract = extractvalue { ptr, i1 } %call, 0
+  store i32 0, ptr %extract
+  %v = load i32, ptr %a
+  ret i32 %v
+}
+
+; CHECK: MayAlias: i32* %a, i32* %extract
+define i32 @test_extractelement_escape() {
+  %a = alloca i32
+  call void @escape(ptr %a)
+  %call = call <2 x ptr> @get_vec()
+  %extract = extractelement <2 x ptr> %call, i32 0
+  store i32 0, ptr %extract
+  %v = load i32, ptr %a
+  ret i32 %v
+}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
index 5e9dc7f2b91cc..38b7389ae9083 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
@@ -83,10 +83,52 @@ exit:
 define void @dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs(ptr %a, ptr %b, ptr %c, i64 %offset, i64 %n) {
 ; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: cannot check memory dependencies at runtime
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:       Check 0:
+; CHECK-NEXT:         Comparing group ([[GRP5:0x[0-9a-f]+]]):
+; CHECK-NEXT:           %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
+; CHECK-NEXT:         Against group ([[GRP6:0x[0-9a-f]+]]):
+; CHECK-NEXT:           %select = select i1 %cmp, ptr %gep.b, ptr %gep.c
+; CHECK-NEXT:       Check 1:
+; CHECK-NEXT:         Comparing group ([[GRP5]]):
+; CHECK-NEXT:           %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
+; CHECK-NEXT:         Against group ([[GRP7:0x[0-9a-f]+]]):
+; CHECK-NEXT:           %select = select i1 %cmp, ptr %gep.b, ptr
%gep.c +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP8:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 3: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP7]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Check 4: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP8]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 5: +; CHECK-NEXT: Comparing group ([[GRP7]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP8]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP5]]: +; CHECK-NEXT: (Low: %a High: ((4 * %n) + %a)) +; CHECK-NEXT: Member: {%a,+,4}<%loop> +; CHECK-NEXT: Group [[GRP6]]: +; CHECK-NEXT: (Low: %b High: ((4 * %n) + %b)) +; CHECK-NEXT: Member: {%b,+,4}<%loop> +; CHECK-NEXT: Group [[GRP7]]: +; CHECK-NEXT: (Low: %c High: ((4 * %n) + %c)) +; CHECK-NEXT: Member: {%c,+,4}<%loop> +; CHECK-NEXT: Group [[GRP8]]: +; CHECK-NEXT: (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a)) +; CHECK-NEXT: Member: {((4 * %offset) + %a),+,4}<%loop> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: diff --git a/llvm/test/Analysis/ProfileSummary/basic.ll b/llvm/test/Analysis/ProfileSummary/basic.ll index c4f48ccafde86..0385c3a921c01 100644 --- a/llvm/test/Analysis/ProfileSummary/basic.ll +++ b/llvm/test/Analysis/ProfileSummary/basic.ll @@ -7,9 +7,9 @@ define void @f1() !prof !20 { ; CHECK-LABEL: f1 :hot -; OVERRIDE-HOT-LABEL: f1 +; OVERRIDE-HOT-LABEL: f1{{$}} ; OVERRIDE-COLD-LABEL: f1 :hot -; OVERRIDE-BOTH-LABEL: f1 +; OVERRIDE-BOTH-LABEL: f1{{$}} ; HOT-CUTOFF-0-LABEL: f1{{$}} ; COLD-CUTOFF-0-LABEL: f1 :cold @@ -19,8 +19,8 @@ define void @f1() !prof !20 { define void @f2() !prof !21 { ; CHECK-LABEL: f2 :cold ; OVERRIDE-HOT-LABEL: f2 :cold -; OVERRIDE-COLD-LABEL: f2 -; OVERRIDE-BOTH-LABEL: f2 +; OVERRIDE-COLD-LABEL: f2{{$}} +; OVERRIDE-BOTH-LABEL: f2 :cold ; HOT-CUTOFF-0-LABEL: f2 :cold ; COLD-CUTOFF-0-LABEL: f2 :cold @@ -28,10 +28,10 @@ define void @f2() !prof !21 { } define void @f3() !prof !22 { -; CHECK-LABEL: f3 -; OVERRIDE-HOT-LABEL: f3 -; OVERRIDE-COLD-LABEL: f3 -; OVERRIDE-BOTH-LABEL: f3 +; CHECK-LABEL: f3 :hot +; OVERRIDE-HOT-LABEL: f3{{$}} +; OVERRIDE-COLD-LABEL: f3 :hot +; OVERRIDE-BOTH-LABEL: f3 :cold ; HOT-CUTOFF-0-LABEL: f3{{$}} ; COLD-CUTOFF-0-LABEL: f3 :cold diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll index 72f092adf5054..77b78fb4bd4f8 100644 --- a/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll +++ b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll @@ -1,7 +1,7 @@ ; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata-2.bc | FileCheck %s ; CHECK-LABEL: define void @_ZN4alsa3pcm3PCM17hw_params_current17hf1c237aece2f69c4E() { -; CHECK: #dbg_value(ptr undef, !4, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !14 +; CHECK: #dbg_value(ptr poison, !4, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !14 
; CHECK-LABEL: define void @_ZN4alsa3pcm8HwParams3any17h02a64cfc85ce8a66E() { -; CHECK: #dbg_value(ptr undef, !23, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !28 +; CHECK: #dbg_value(ptr poison, !23, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !28 diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll index ecc39a86c6327..84b1a8f5ba45d 100644 --- a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll +++ b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll @@ -1,4 +1,4 @@ ; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata.bc | FileCheck %s ; CHECK-LABEL: define void @test() { -; CHECK: #dbg_value(i64 undef, !4, !DIExpression(DW_OP_LLVM_fragment, 64, 64), !13 +; CHECK: #dbg_value(i64 poison, !4, !DIExpression(DW_OP_LLVM_fragment, 64, 64), !13 diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll index bfe9ab8424bb0..0bd7c1b10b123 100644 --- a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll +++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll @@ -7,7 +7,7 @@ define half @f2h(float %a) { ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 0c3a40d93d640..21729b9dfd101 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -60,13 +60,13 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -148,13 +148,13 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -712,22 +712,22 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: 
and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 24088998f36d1..9b5e48d2b4217 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -62,13 +62,13 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -150,13 +150,13 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -592,22 +592,22 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; 
SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 65f1f4863c173..f6c542fe7d407 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -62,13 +62,13 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -150,13 +150,13 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -592,22 +592,22 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 0f1a2f03c98c3..82e0f14e68e26 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -60,13 +60,13 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee 
+; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -148,13 +148,13 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -712,22 +712,22 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index e9722f348f411..363f0a0598e23 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll +++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -18,6 +18,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a77 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a78 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-x1 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=grace 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-e1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-n1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-n2 2>&1 | FileCheck %s diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll index 3db802a2bc355..63b8a1cee27ae 100644 --- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -22,7 +22,7 @@ define void @f16_arg(half %arg, ptr %ptr) #0 { ; 
NOFP16-NEXT: .cfi_offset w30, -16 ; NOFP16-NEXT: and w0, w0, #0xffff ; NOFP16-NEXT: mov x19, x1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: str w0, [x19] ; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; NOFP16-NEXT: ret @@ -44,10 +44,10 @@ define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: and w0, w0, #0xffff ; NOFP16-NEXT: mov x19, x2 ; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: stp w21, w0, [x19] ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload @@ -73,14 +73,14 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: and w0, w1, #0xffff ; NOFP16-NEXT: mov x19, x3 ; NOFP16-NEXT: mov w20, w2 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w8, w0 ; NOFP16-NEXT: and w0, w20, #0xffff ; NOFP16-NEXT: orr x21, x8, x22, lsl #32 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: str x21, [x19] ; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: str w0, [x19, #8] @@ -110,16 +110,16 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w20, w3 ; NOFP16-NEXT: mov w21, w2 ; NOFP16-NEXT: mov w22, w1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w23, w0 ; NOFP16-NEXT: and w0, w22, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: stp w21, w0, [x19, #8] ; NOFP16-NEXT: stp w23, w22, [x19] ; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload @@ -137,7 +137,7 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; NOFP16-NEXT: .cfi_def_cfa_offset 16 ; NOFP16-NEXT: .cfi_offset w30, -16 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; NOFP16-NEXT: ret %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -155,10 +155,10 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: .cfi_offset w30, -32 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w20 ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload @@ -180,13 +180,13 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w2 ; NOFP16-NEXT: mov w19, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w19 ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: mov w2, w21 @@ -212,16 +212,16 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w0, w3 ; NOFP16-NEXT: mov w19, w2 ; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w21 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w20 ; NOFP16-NEXT: mov w2, w19 ; NOFP16-NEXT: mov w3, w22 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index d1a303b41deef..ed0a522f6c11d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -602,15 +602,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -757,21 +755,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1799,19 +1789,16 @@ 
define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index b8538cbf254fc..0d02c0d8cb464 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -602,15 +602,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -757,21 +755,18 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; 
GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll index c7676e9da6f49..0688b5e42cc4c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll @@ -60,8 +60,6 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmin = call float @llvm.minnum.f32(float %a, float 10.0) @@ -158,8 +156,6 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmin = call float @llvm.minnum.f32(float %a, float 10.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index 99e6c5d06a0e1..0b09cabf25a16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -3,7 +3,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define float @v_fma_f32(float %x, float %y, float %z) { ; GFX6-LABEL: v_fma_f32: @@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %fma = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %fma } @@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_lhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs: +; GFX11-TRUE16: ; 
%bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg half %x %fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z) ret half %fma @@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_rhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg half %y %fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z) ret half %fma @@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_add: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg half %z %fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z) ret half %fma diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll index 75c4cd53e3bfc..b0b41c1c466e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -82,8 +82,6 @@ define half @test_min_K1max_ValK0_f16(half %a) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call half @llvm.maxnum.f16(half %a, half 2.0) @@ -474,8 +472,6 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) @@ -507,8 +503,6 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX12-NEXT: 
s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) @@ -541,8 +535,6 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll index ae2bcbbb81b5f..a6e6b84bba304 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll @@ -20,8 +20,12 @@ define void @non_kernel_function() sanitize_address { ; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] -; CHECK-NEXT: [[Y:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(1) [[TMP13]] to ptr ; CHECK-NEXT: store i8 5, ptr [[TMP9]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll index 3a05f93df35a3..b9b4c90daea87 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. 
@@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP34:%.*]] = addrspacecast ptr addrspace(1) [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP36]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -45,16 +49,16 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP25:%.*]] = and i1 [[TMP21]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP25]]) ; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 0 -; CHECK-NEXT: br i1 [[TMP27]], label [[ASAN_REPORT:%.*]], label [[TMP30:%.*]], !prof [[PROF2:![0-9]+]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP25]], label [[TMP28:%.*]], label [[TMP29:%.*]] -; CHECK: 28: +; CHECK-NEXT: br i1 [[TMP27]], label %[[ASAN_REPORT:.*]], label %[[BB35:.*]], !prof [[PROF2:![0-9]+]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP25]], label %[[BB33:.*]], label %[[BB34:.*]] +; CHECK: [[BB33]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7:[0-9]+]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP29]] -; CHECK: 29: -; CHECK-NEXT: br label [[TMP30]] -; CHECK: 30: +; CHECK-NEXT: br label %[[BB34]] +; CHECK: [[BB34]]: +; CHECK-NEXT: br label %[[BB35]] +; CHECK: [[BB35]]: ; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8 ; CHECK-NEXT: ret void ; @@ -67,15 +71,15 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; 
CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -100,9 +104,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -124,16 +128,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]]) ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0 -; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT:%.*]], label [[TMP46:%.*]], !prof [[PROF2]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[CONDFREE:%.*]] -; CHECK: 44: +; CHECK-NEXT: br i1 [[TMP43]], label %[[ASAN_REPORT:.*]], label %[[BB46:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP41]], label %[[BB44:.*]], label %[[BB45:.*]] +; CHECK: [[BB44]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[CONDFREE]] -; CHECK: 45: -; CHECK-NEXT: br label [[TMP46]] -; CHECK: 46: +; CHECK-NEXT: br label %[[BB45]] +; CHECK: [[BB45]]: +; CHECK-NEXT: br label %[[BB46]] +; CHECK: [[BB46]]: ; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1 ; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]] @@ -152,16 +156,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP59:%.*]] = and i1 [[TMP54]], [[TMP58]] ; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP59]]) ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP60]], 0 -; CHECK-NEXT: br i1 [[TMP61]], label [[ASAN_REPORT1:%.*]], label [[TMP64:%.*]], !prof [[PROF2]] -; CHECK: asan.report1: -; CHECK-NEXT: br i1 [[TMP59]], label [[TMP62:%.*]], label [[TMP63:%.*]] -; CHECK: 64: +; CHECK-NEXT: br i1 [[TMP61]], label %[[ASAN_REPORT1:.*]], label %[[BB66:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT1]]: +; CHECK-NEXT: br i1 [[TMP59]], label %[[BB64:.*]], label %[[BB65:.*]] +; CHECK: [[BB64]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP83]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP63]] -; CHECK: 65: -; CHECK-NEXT: br label [[TMP64]] -; CHECK: 66: +; CHECK-NEXT: br label %[[BB65]] +; CHECK: [[BB65]]: +; CHECK-NEXT: br label %[[BB66]] +; CHECK: [[BB66]]: ; CHECK-NEXT: [[TMP84:%.*]] = ptrtoint ptr addrspace(1) [[TMP82]] to i64 ; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP84]], 3 ; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP85]], 2147450880 @@ -174,28 +178,28 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP76:%.*]] = and i1 [[TMP72]], [[TMP75]] ; 
CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP76]]) ; CHECK-NEXT: [[TMP78:%.*]] = icmp ne i64 [[TMP77]], 0 -; CHECK-NEXT: br i1 [[TMP78]], label [[ASAN_REPORT2:%.*]], label [[TMP81:%.*]], !prof [[PROF2]] -; CHECK: asan.report2: -; CHECK-NEXT: br i1 [[TMP76]], label [[TMP79:%.*]], label [[TMP80:%.*]] -; CHECK: 79: +; CHECK-NEXT: br i1 [[TMP78]], label %[[ASAN_REPORT2:.*]], label %[[BB81:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT2]]: +; CHECK-NEXT: br i1 [[TMP76]], label %[[BB79:.*]], label %[[BB80:.*]] +; CHECK: [[BB79]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP84]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP80]] -; CHECK: 80: -; CHECK-NEXT: br label [[TMP81]] -; CHECK: 81: +; CHECK-NEXT: br label %[[BB80]] +; CHECK: [[BB80]]: +; CHECK-NEXT: br label %[[BB81]] +; CHECK: [[BB81]]: ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll index 1dd391ec6321a..255dda562c1ea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel. @@ -6,50 +6,64 @@ @A = external addrspace(3) global [8 x ptr] @B = external addrspace(3) global [0 x i32] +;. 
+; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]] +; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]] +; @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address +;. 
define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; 
CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,56 +72,56 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) 
@llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: 
[[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,48 +130,48 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: 
br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,56 +180,56 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: 
[[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = 
call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -237,14 +251,16 @@ define private void @store_A() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; 
CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: store ptr [[TMP11]], ptr null, align 8 ; CHECK-NEXT: ret void ; store ptr addrspacecast (ptr addrspace(3) @A to ptr), ptr null @@ -256,14 +272,16 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: ret ptr [[TMP10]] +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: ret ptr [[TMP11]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) } @@ -272,8 +290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. 
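;
; Illustrative sketch, not part of the generated checks above: the updated
; expectations in these sw-lower-lds tests reflect that, under ASan, an LDS
; access is no longer addrspacecast from addrspace(3) directly. The pass
; instead rebases the LDS offset onto the malloc'd global-memory buffer so
; the access is visible to the instrumentation. Assuming hypothetical values
; %lds.ptr (the lowered addrspace(3) address) and %buf (the per-kernel buffer
; loaded from the corresponding @llvm.amdgcn.sw.lds.* global), the checked
; pattern is roughly equivalent to:
;
;   %off = ptrtoint ptr addrspace(3) %lds.ptr to i32
;   %gep = getelementptr inbounds i8, ptr addrspace(1) %buf, i32 %off
;   %p   = addrspacecast ptr addrspace(1) %gep to ptr
;   store i8 3, ptr %p, align 4
;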
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll index ed9107764eb91..7184ebbb8faa3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel. @@ -6,18 +6,32 @@ @A = external addrspace(3) global [8 x ptr] @B = external addrspace(3) global [0 x i32] +;. +; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]] +; @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]] +; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, 
i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address +;. define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -33,23 +47,23 @@ define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,16 +72,16 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] 
!llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -90,24 +104,24 @@ define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,16 +130,16 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label 
[[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -141,23 +155,23 @@ define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,16 +180,16 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -198,24 +212,24 @@ define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-NEXT: 
[[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -243,7 +257,9 @@ define private void @store_A() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 ; CHECK-NEXT: ret void ; @@ -262,7 +278,9 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: ret ptr [[TMP10]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) @@ -272,8 +290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll index b9fa89dd6f0a6..704bc9e635294 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll @@ -29,8 +29,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP9]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) [[TMP10]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP11]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(1) [[TMP19]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP12]] to i32 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll index 11e912287c7f7..8f5abe962f8eb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. 
@@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = addrspacecast ptr addrspace(1) [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -44,16 +48,16 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -78,9 +82,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -94,17 +98,17 @@ define amdgpu_kernel void @k0() sanitize_address { 
; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]] ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() @@ -124,5 +128,6 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} -; CHECK: [[META1]] = !{i32 0} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +; CHECK: [[META2]] = !{i32 0} ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll new file mode 100644 index 0000000000000..1973a0acf4659 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s +@lds = internal addrspace(3) global [5 x i32] poison, align 16 + +;. +; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 16, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 20, i32 64 } }, no_sanitize_address +;. 
+define amdgpu_kernel void @k0() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 52 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 44) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(3) [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = addrspacecast ptr addrspace(1) [[TMP23]] to ptr +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [5 x i32], ptr [[TMP24]], i64 0, i64 0 +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + %gep = 
getelementptr inbounds [5 x i32], ptr addrspacecast (ptr addrspace(3) @lds to ptr), i64 0, i64 0 + store i32 1, ptr %gep, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll new file mode 100644 index 0000000000000..34caf91def933 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check that accesses through a vector of static LDS pointers in a kernel are lowered correctly. +@lds_var1 = internal addrspace(3) global i32 poison +@lds_var2 = internal addrspace(3) global i32 poison + +;. +; CHECK: @llvm.amdgcn.sw.lds.example = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.example.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.example.md.type { %llvm.amdgcn.sw.lds.example.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address +;.
+define amdgpu_kernel void @example() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @example( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[ENTRY:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 28) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: br label %[[ENTRY]] +; CHECK: [[ENTRY]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP23]] +; CHECK-NEXT: [[VEC_LDS_PTRS:%.*]] = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) [[TMP22]], i32 0 +; CHECK-NEXT: [[VEC_LDS_PTRS1:%.*]] = insertelement <2 x ptr addrspace(3)> [[VEC_LDS_PTRS]], ptr addrspace(3) [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VEC_LDS_PTRS1]] to <2 x i32> +; CHECK-NEXT: 
[[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP32:%.*]] = addrspacecast <2 x ptr addrspace(1)> [[TMP31]] to <2 x ptr> +; CHECK-NEXT: [[ELEM0:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 0 +; CHECK-NEXT: store i32 42, ptr [[ELEM0]], align 4 +; CHECK-NEXT: [[ELEM1:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 1 +; CHECK-NEXT: store i32 43, ptr [[ELEM1]], align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP33:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr [[TMP33]] to i64 +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP35]], i64 [[TMP34]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; +entry: + ; Create a vector of LDS pointers and cast it to a vector of flat pointers + %vec_lds_ptrs = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) @lds_var1, i32 0 + %vec_lds_ptrs1 = insertelement <2 x ptr addrspace(3)> %vec_lds_ptrs, ptr addrspace(3) @lds_var2, i32 1 + %vec_flat_ptrs = addrspacecast <2 x ptr addrspace(3)> %vec_lds_ptrs1 to <2 x ptr> + %elem0 = extractelement <2 x ptr> %vec_flat_ptrs, i32 0 + store i32 42, ptr %elem0, align 4 + %elem1 = extractelement <2 x ptr> %vec_flat_ptrs, i32 1 + store i32 43, ptr %elem1, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;.
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 38adf60888eca..6d8671c7f78a3 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1195,23 +1195,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1389,26 +1386,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1598,17 
+1593,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -2002,23 +1995,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2289,23 +2279,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], 
v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2487,46 +2474,44 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2851,39 +2836,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3203,15 +3186,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3226,30 +3209,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 
exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 +; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v5 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3265,14 +3245,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3280,7 +3260,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5372,23 +5352,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null 
offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v2 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5694,24 +5670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v1, v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6003,7 +5978,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6019,25 +5994,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v7, v8, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6053,14 +6026,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6068,7 +6041,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 2b8cea9068d87..0f40c2d563111 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1195,23 +1195,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1389,26 +1386,24 @@ 
define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1598,17 +1593,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -2002,23 +1995,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2289,23 +2279,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2487,46 +2474,44 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2851,39 +2836,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; 
GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3203,15 +3186,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3226,30 +3209,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 +; GFX12-NEXT: v_min_num_f16_e32 v6, v6, v5 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3265,14 +3245,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3280,7 +3260,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5372,23 +5352,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v4, v5, v2 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5694,24 +5670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_min_num_f16 v1, v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null 
offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6003,7 +5978,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6019,25 +5994,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_min_num_f16 v7, v8, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6053,14 +6026,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6068,7 +6041,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git 
a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 73ed23ab681f0..3a13eecd84781 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -413,12 +413,11 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -518,9 +517,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3157,9 +3154,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -3257,9 +3252,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index ac7944f25fe37..23e4b80b61f69 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s +# FIXME-TRUE16. 
Re-enable after fix-sgpr-copies is fixed for the true16 flow +# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 6ead5b93a0e39..9a69c254b1000 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2770,29 +2770,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2801,7 +2799,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -2810,16 +2808,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ;
%atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3159,7 +3155,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo @@ -3187,9 +3182,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3209,13 +3203,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3572,7 +3564,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo @@ -3600,9 +3591,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3622,13 +3612,11 @@ define double 
@flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3986,7 +3974,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4002,20 +3989,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4024,19 +4009,17 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: 
v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4371,15 +4354,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4392,20 +4374,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4413,20 +4393,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; 
GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4777,15 +4755,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4798,20 +4775,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4819,20 +4794,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, 
v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5184,29 +5157,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5215,7 +5186,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5224,16 +5195,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5607,29 +5576,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5638,7 +5605,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5647,16 +5614,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6001,9 +5966,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: 
v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6017,12 +5981,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6305,11 +6268,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6321,12 +6283,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6618,11 +6579,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6634,12 +6594,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; 
GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6930,9 +6889,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6945,12 +6903,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7222,37 +7178,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: 
v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7526,37 +7479,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 
0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7832,17 +7782,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8067,24 +8015,21 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8300,11 +8245,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8316,12 +8260,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -8613,38 +8556,35 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12279,15 +12219,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 
; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12514,15 +12452,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12752,15 +12688,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13007,21 +12941,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: 
v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13233,21 +13164,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13466,21 +13394,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13718,15 +13643,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] 
offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13958,22 +13881,19 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 1fc9ed70e009c..383c1031330b9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2770,29 +2770,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2801,7 +2799,7 @@ define double 
@flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -2810,16 +2808,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3159,7 +3155,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo @@ -3187,9 +3182,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3209,13 +3203,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3572,7 +3564,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo @@ -3600,9 +3591,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3622,13 +3612,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3986,7 +3974,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4002,20 +3989,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 
0xfffe @@ -4024,19 +4009,17 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4371,15 +4354,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4392,20 +4374,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4413,20 +4393,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; 
%Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4777,15 +4755,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4798,20 +4775,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4819,20 +4794,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5184,29 +5157,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5215,7 +5186,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5224,16 +5195,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; 
GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5607,29 +5576,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5638,7 +5605,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5647,16 +5614,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], 
v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6001,9 +5966,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6017,12 +5981,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6305,11 +6268,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6321,12 +6283,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6618,11 
+6579,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6634,12 +6594,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6930,9 +6889,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6945,12 +6903,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7222,37 +7178,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7526,37 +7479,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7832,17 +7782,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8067,24 +8015,21 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8300,11 +8245,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8316,12 +8260,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -8613,38 +8556,35 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12279,15 +12219,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12514,15 +12452,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12752,15 +12688,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13007,21 +12941,18 @@ define void 
@flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13233,21 +13164,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13466,21 +13394,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; 
GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13718,15 +13643,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13958,22 +13881,19 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 52a23690dcf53..a33fd03e0ce03 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -3,8 +3,10 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 
-mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL @@ -24,11 +26,34 @@ define half @test_fma(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fma: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fma: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fma: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fma: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fma: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fma: ; GFX12: ; %bb.0: @@ -57,11 +82,31 @@ define half @test_fmac(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmac: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmac: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmac: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmac: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmac: +; GFX11-GISEL-FAKE16: ; %bb.0: +; 
GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmac: ; GFX12: ; %bb.0: @@ -98,11 +143,31 @@ define half @test_fmaak(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmaak: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmaak: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v0.h, 0x4200 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmaak: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmaak: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmaak: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmaak: ; GFX12: ; %bb.0: @@ -139,11 +204,33 @@ define half @test_fmamk(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmamk: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmamk: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmamk: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmamk: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 0x4200, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmamk: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmamk: ; GFX12: ; %bb.0: @@ -208,33 +295,61 @@ define i32 @test_D139469_f16(half %arg) { ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_D139469_f16: -; GFX11-SDAG: ; %bb.0: ; 
%bb -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-SDAG-NEXT: v_min_f16_e32 v0, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_D139469_f16: -; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 -; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e +; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; 
GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_D139469_f16: ; GFX12-SDAG: ; %bb.0: ; %bb @@ -347,44 +462,83 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_D139469_v2f16: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e -; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_D139469_v2f16: -; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 -; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e +; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; 
GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e +; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e +; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e +; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_D139469_v2f16: ; GFX12-SDAG: ; %bb.0: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 269fd52df5c49..31d3faf9ea83c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -887,18 +887,17 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s4, s6 +; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 @@ -1063,18 +1062,17 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s4, s6 +; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm %a = load volatile double, 
ptr addrspace(1) %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir index 08693ec9db1d4..2492eb2982aac 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir @@ -133,7 +133,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 %noninlinable, [[COPY]], 1056964608, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 @@ -152,7 +153,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], %noninlinable, 1056964608, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 @@ -210,7 +212,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], 1056964608, 1234567890, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, %noninlinable, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir index 5f985605c082d..c8afb89aa272a 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir @@ -69,3 +69,202 @@ body: | %0:sreg_32 = S_MOV_B32 63 %1:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def $scc ... + +# GCN-LABEL: name: test_no_fold_literal_already_inline_lhs{{$}} +# GCN: %0:sreg_32 = S_MOV_B32 80 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 70, %0 +--- +name: test_no_fold_literal_already_inline_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_no_fold_literal_already_inline_rhs{{$}} +# GCN: %0:sreg_32 = S_MOV_B32 80 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %0, 70 +--- +name: test_no_fold_literal_already_inline_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 %0, 70, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_literal_inlineimm_lhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 64, 80 +--- +name: test_fold_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 64, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_literal_inlineimm_rhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 80, 64 +--- +name: test_fold_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 %0, 64, implicit-def $scc +... 
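# Annotation (illustrative sketch, not part of the patch): the new tests above
# exercise SIFoldOperands folding a materialized literal into a SALU user. A
# SALU instruction encodes at most one 32-bit literal constant, but inline
# immediates (integers -16..64, e.g. the 64 above) do not occupy that slot, so
# the moved literal can still fold — assuming the file's existing RUN line
# drives -run-pass=si-fold-operands, as the GCN checks suggest:
#
#   %0:sreg_32 = S_MOV_B32 80                         ; 80: non-inline literal
#   %1:sreg_32 = S_ADD_I32 %0, 64, implicit-def $scc  ; 64: inline immediate
#   ; expected fold: %1:sreg_32 = S_ADD_I32 80, 64, implicit-def $scc
#
# With a second non-inline literal (70 alongside 80) the fold would need two
# literal slots, which is why the no_fold tests above keep the register source.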
+ +# GCN-LABEL: name: test_fold_same_literal_2x{{$}} +# GCN: %2:sreg_32 = S_ADD_I32 70, %1 +--- +name: test_fold_same_literal_2x +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 70 + %2:sreg_32 = S_ADD_I32 %0, %1, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_same_literal_lhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 70, %0 +--- +name: test_fold_same_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_ADD_I32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_same_literal_rhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 %0, 70 +--- +name: test_fold_same_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_ADD_I32 %0, 70, implicit-def $scc +... + + +# GCN-LABEL: name: test_s_cselect_b32_2x_literal_fold{{$}} +# GCN: %2:sreg_32 = S_CSELECT_B32 70, %1, implicit $scc +--- +name: test_s_cselect_b32_2x_literal_fold +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %2:sreg_32 = S_CSELECT_B32 %0, %1, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_literal_lhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 70, %0, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 70, %0, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_literal_rhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 %0, 70, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 %0, 70, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_inlineimm_lhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 64, 80, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 64, %0, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_inlineimm_rhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 80, 64, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 %0, 64, implicit $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_2x_literal_fold{{$}} +# GCN: S_CMP_EQ_U32 70, %1, implicit-def $scc +--- +name: test_s_cmp_b32_2x_literal_fold +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + S_CMP_EQ_U32 %0, %1, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_literal_lhs{{$}} +# GCN: S_CMP_EQ_U32 70, %0, implicit-def $scc +--- +name: test_s_cmp_b32_literal_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_literal_rhs{{$}} +# GCN: S_CMP_EQ_U32 %0, 70, implicit-def $scc +--- +name: test_s_cmp_b32_literal_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 %0, 70, implicit-def $scc +... 
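# Annotation (illustrative sketch, not part of the patch): the same one-literal
# budget governs the compare and select tests. S_CMP_EQ_U32 has no destination,
# only two sources and an implicit $scc def (which S_CSELECT_B32 then reads),
# so the fold is legal exactly when at most one non-inline literal remains:
#
#   %0:sreg_32 = S_MOV_B32 80
#   S_CMP_EQ_U32 %0, 64, implicit-def $scc
#   ; expected fold: S_CMP_EQ_U32 80, 64, implicit-def $scc
#
# whereas S_CMP_EQ_U32 %0, 70 keeps %0, since folding would leave two distinct
# literals (70 and 80) in one instruction, as the literal_literal tests check.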
+ +# GCN-LABEL: name: test_s_cmp_b32_literal_inlineimm_lhs{{$}} +# GCN: S_CMP_EQ_U32 64, 80, implicit-def $scc +--- +name: test_s_cmp_b32_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 64, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_inlineimm_rhs{{$}} +# GCN: S_CMP_EQ_U32 80, 64, implicit-def $scc +--- +name: test_s_cmp_b32_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 %0, 64, implicit-def $scc +... diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index e8d73914ad302..f3c08970ae2ca 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -2991,15 +2991,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3163,15 +3161,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3336,15 +3332,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 
v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3508,21 +3502,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3670,21 +3661,18 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3835,21 +3823,18 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4001,15 +3986,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4249,15 +4232,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4425,9 +4406,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4441,12 +4421,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 
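; Annotation (not part of the generated checks): across these GFX12 fmin/fmax
; atomic expansions, the "v_max_num_* x, x, x" and "v_pk_max_num_f16 x, x, x"
; self-max instructions are removed, both before the CAS loop (on the incoming
; operand) and inside it (on the loaded value). Those were input
; canonicalizations that quieted signaling NaNs ahead of the min/max;
; presumably they are redundant because the GFX12 *_num opcodes already
; implement IEEE 754-2019 minimumNumber/maximumNumber sNaN semantics. The
; registers renumber accordingly (e.g. v2/v3 -> v3/v4), and the s_delay_alu
; dependency chains shorten by one VALU op per loop iteration.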
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4779,11 +4758,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4795,12 +4773,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5144,11 +5121,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5160,12 +5136,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; 
GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5508,9 +5483,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5523,12 +5497,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5849,37 +5821,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6203,37 +6172,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6559,17 +6525,15 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6833,24 +6797,21 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7103,11 +7064,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7119,12 +7079,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -7468,38 +7427,35 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -11675,15 +11631,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; 
GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11967,15 +11921,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12261,15 +12213,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12558,21 +12508,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12839,21 +12786,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13123,21 +13067,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13416,15 +13357,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], 
v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13712,22 +13651,19 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index c1c92906df250..eca7a101bb5a2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -2991,15 +2991,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3163,15 +3161,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3336,15 +3332,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3508,21 +3502,18 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3670,21 +3661,18 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3835,21 +3823,18 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4001,15 +3986,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4249,15 +4232,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: 
s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4425,9 +4406,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4441,12 +4421,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4779,11 +4758,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4795,12 +4773,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5144,11 +5121,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5160,12 +5136,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5508,9 +5483,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5523,12 +5497,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5849,37 +5821,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6203,37 +6172,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 
v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6559,17 +6525,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6833,24 +6797,21 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7103,11 +7064,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; 
GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7119,12 +7079,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -7468,38 +7427,35 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -11675,15 +11631,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11967,15 +11921,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12261,15 +12213,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12558,21 +12508,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 
v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12839,21 +12786,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13123,21 +13067,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13416,15 +13357,13 
@@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13712,22 +13651,19 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 681c07db327dc..f398497e6b28f 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -784,29 +784,28 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-NEXT: ds_load_b32 v2, v1 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 +; GFX12-NEXT: v_not_b32_e32 v3, v3 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_mov_b32_e32 v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 
0xffff, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -814,7 +813,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16: @@ -1108,12 +1107,11 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1428,12 +1426,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1730,12 +1727,10 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2034,10 +2029,9 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v1, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2276,11 +2270,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -4514,15 +4506,13 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4785,15 +4775,13 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5056,14 +5044,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 
0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5317,14 +5302,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index bf56496e98690..dc7953567450a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -784,29 +784,28 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-NEXT: ds_load_b32 v2, v1 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 +; GFX12-NEXT: v_not_b32_e32 v3, v3 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_mov_b32_e32 v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -814,7 +813,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
local_atomic_fmin_ret_f16: @@ -1108,12 +1107,11 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1428,12 +1426,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1730,12 +1727,10 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2034,10 +2029,9 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v1, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2276,11 +2270,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -4514,15 +4506,13 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4785,15 +4775,13 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5056,14 +5044,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5317,14 +5302,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 954dab3d0fc6f..177f98ddd0045 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ 
b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -297,29 +297,15 @@ define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) { ; GISEL-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GISEL-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-LABEL: test_minmax_f32_ieee_true: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) %minmax = call float @llvm.minnum.f32(float %max, float %c) ret float %minmax @@ -401,29 +387,15 @@ define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) { ; GISEL-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; SDAG-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GISEL-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-LABEL: test_maxmin_f32_ieee_true: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) %maxmin = call float @llvm.maxnum.f32(float %min, float %c) ret float %maxmin @@ -638,11 +610,9 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l ; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: @@ -652,9 +622,6 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -665,10 +632,7 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l ; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: @@ -678,9 +642,6 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) @@ -782,11 +743,9 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l ; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: @@ -796,9 +755,6 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; 
SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -809,10 +765,7 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l ; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: @@ -822,9 +775,6 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll index b008f397318e8..89c9801b5e466 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ ; RUN: | FileCheck --match-full-lines --implicit-check-not='declare' %s ; Confirms we do not leave behind a declaration which references the same diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 0ad1c30b5b5a4..1f36101c7b53a 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -814,7 +814,8 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir new file mode 100644 index 0000000000000..d551ad88f56b7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir @@ -0,0 +1,242 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 + +--- +name: mad_cvv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_cvv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: 
$vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vcv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vvc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vsc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_cvv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vcv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vvc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_vsc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vsc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_cvv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_cvv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vcv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vvc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vsc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_cvv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vcv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_vvc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vvc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vsc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir new file mode 100644 index 0000000000000..89ef5df9beb8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir @@ -0,0 +1,258 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10 + +--- +name: mad_cvv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_cvv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vcv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vvc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vsc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_cvv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_cvv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vcv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vvc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vsc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_cvv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_cvv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vcv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vvc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: mad_vsc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vsc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_cvv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vcv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vvc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vsc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
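For context on the two new MIR files above: they split the old combined coverage so that gfx10 and gfx11 fake16 each pin down how si-shrink-instructions folds a VOP3 mad/fma carrying one literal operand. A literal in a multiplicand slot (src0 or src1) shrinks to the "MK" form, where the literal acts as the multiplier; a literal in the addend slot (src2) shrinks to the "AK" form. The immediates are both bit patterns of 10.0: 1092616192 is float 10.0 (0x41200000) and 18688 is half 10.0 (0x4900). A minimal IR sketch of the shape that reaches this path follows; the function names are hypothetical, not taken from this patch.

; Illustrative IR only (hypothetical names): a constant multiplicand is
; the V_FMAMK_F32 shape, a constant addend the V_FMAAK_F32 shape.
define float @fma_const_mul(float %a, float %b) {
  %r = call float @llvm.fma.f32(float %a, float 10.0, float %b)
  ret float %r
}

define float @fma_const_add(float %a, float %b) {
  %r = call float @llvm.fma.f32(float %a, float %b, float 10.0)
  ret float %r
}

declare float @llvm.fma.f32(float, float, float)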
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir index 26feb8120c751..c9138dda7d1a7 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir @@ -1,17 +1,10 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 --- name: mad_cvv_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_cvv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -27,12 +20,6 @@ body: | name: mad_vcv_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vcv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -48,12 +35,6 @@ body: | name: mad_vvc_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vvc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vvc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -69,12 +50,6 @@ body: | name: mad_vsc_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vsc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -90,12 +65,6 @@ body: | name: fma_cvv_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_cvv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -111,12 +80,6 @@ body: | name: fma_vcv_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vcv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -132,12 +95,6 @@ body: | name: fma_vvc_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vvc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vvc_f32 ; 
GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -153,12 +110,6 @@ body: | name: fma_vsc_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vsc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -174,12 +125,6 @@ body: | name: mad_cvv_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_cvv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -195,12 +140,6 @@ body: | name: mad_vcv_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vcv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -216,12 +155,6 @@ body: | name: mad_vvc_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vvc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -237,12 +170,6 @@ body: | name: mad_vsc_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vsc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -258,20 +185,14 @@ body: | name: fma_cvv_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_cvv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, 18688, 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... 
@@ -279,20 +200,14 @@ body: | name: fma_vcv_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vcv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, 18688, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... @@ -300,20 +215,14 @@ body: | name: fma_vvc_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vvc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_lo16, $vgpr1_lo16, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... @@ -321,19 +230,13 @@ body: | name: fma_vsc_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vsc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_hi16, $vgpr1_hi16, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $sgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_hi16, 0, $vgpr1_hi16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... 
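For the f16 cases the RUN-line split above is the point: with -mattr=+real-true16 the operands live in 16-bit register halves ($vgpr0_lo16 and friends) and the shrunk opcodes carry a _t16 suffix, while the fake16 file keeps whole 32-bit VGPRs and _fake16 opcodes. The same source shape drives both; only the register class of the selected instructions differs. A hedged sketch, with a hypothetical function name (18688 is the bit pattern of half 10.0):

; Illustrative IR only: the f16 analogue of the f32 cases. Under gfx11
; +real-true16 the result occupies a 16-bit half of a VGPR
; ($vgpr2_lo16 in the checks); with -real-true16 it fills a full VGPR.
define half @fma_f16_const_mul(half %a, half %b) {
  %r = call half @llvm.fma.f16(half 10.0, half %a, half %b)
  ret half %r
}

declare half @llvm.fma.f16(half, half, half)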
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir new file mode 100644 index 0000000000000..9af18758e2206 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir @@ -0,0 +1,49 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=register-coalescer -o - %s | FileCheck %s + +--- +name: test_remat_v_pk_mov_b32 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_remat_v_pk_mov_b32 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_1:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_2:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0 + ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]] + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_PK_MOV_B32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_MOV_B32_1]], 8, [[V_PK_MOV_B32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_MOV_B32_2]], 8, [[V_PK_MOV_B32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_1]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_2]] + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_MOV_B32_]] + bb.0: + liveins: $sgpr0 + %0:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + %1:vreg_64_align2 = COPY %0:vreg_64_align2 + %2:vreg_64_align2 = COPY %0:vreg_64_align2 + %3:sreg_64 = COPY $sgpr0 + $exec = S_MOV_B64_term %3:sreg_64 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %1:vreg_64_align2 = V_PK_ADD_F32 8, %1, 8, %1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vreg_64_align2 = V_PK_ADD_F32 8, %2, 8, %2, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + + bb.2: + S_NOP 0, implicit %1 + S_NOP 0, implicit %2 + S_ENDPGM 0, implicit %0 +... 
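The new register-coalescer test above exercises rematerialization: the input has a single V_PK_MOV_B32 feeding two COPYs, the copied values are conditionally overwritten in bb.1, and the original is still needed at S_ENDPGM. The CHECK lines confirm the coalescer emits three independent V_PK_MOV_B32 defs rather than joining the copies into %0's live range. A hedged IR sketch of the shape involved (hypothetical function; assuming a <2 x float> splat constant is the kind of value V_PK_MOV_B32 materializes on gfx942):

; Illustrative IR only: a packed splat needed both inside and after a
; branch. Rematerializing the cheap packed move in each block avoids
; stretching one value's live range across the conditional block.
define <2 x float> @splat_across_branch(i1 %cc, <2 x float> %x) {
entry:
  br i1 %cc, label %then, label %exit
then:
  %t = fadd <2 x float> %x, <float 1.0, float 1.0>
  br label %exit
exit:
  %r = phi <2 x float> [ <float 1.0, float 1.0>, %entry ], [ %t, %then ]
  ret <2 x float> %r
}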
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index 3f5bb166ad0e5..1ca6ebb7ddab8 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -22,8 +22,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: Size: 24 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 2 -; DXC-NEXT: NumParameters: 0 -; DXC-NEXT: RootParametersOffset: 0 ; DXC-NEXT: NumStaticSamplers: 0 ; DXC-NEXT: StaticSamplersOffset: 0 +; DXC-NEXT: Parameters: [] ; DXC-NEXT: AllowInputAssemblerInputLayout: true diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index 7adb17d0b022f..616ce38144095 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -29,7 +29,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 ; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 +; CHECK-NEXT: StaticSamplersOffset: 8 ; CHECK-LABEL: Definition for 'anotherMain': ; CHECK-NEXT: Flags: 0x000002 @@ -37,4 +37,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 ; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 +; CHECK-NEXT: StaticSamplersOffset: 8 diff --git a/llvm/test/CodeGen/LoongArch/fp16-promote.ll b/llvm/test/CodeGen/LoongArch/fp16-promote.ll index c5a27a7011278..3701f0df1d2b2 100644 --- a/llvm/test/CodeGen/LoongArch/fp16-promote.ll +++ b/llvm/test/CodeGen/LoongArch/fp16-promote.ll @@ -23,12 +23,12 @@ define float @test_fpextend_float(ptr %p) nounwind { ; LA32-LABEL: test_fpextend_float: ; LA32: # %bb.0: ; LA32-NEXT: ld.hu $a0, $a0, 0 -; LA32-NEXT: b %plt(__gnu_h2f_ieee) +; LA32-NEXT: b %plt(__extendhfsf2) ; ; LA64-LABEL: test_fpextend_float: ; LA64: # %bb.0: ; LA64-NEXT: ld.hu $a0, $a0, 0 -; LA64-NEXT: b %plt(__gnu_h2f_ieee) +; LA64-NEXT: b %plt(__extendhfsf2) %a = load half, ptr %p %r = fpext half %a to float ret float %r @@ -40,7 +40,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: ld.hu $a0, $a0, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fcvt.d.s $fa0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 @@ -51,7 +51,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: ld.hu $a0, $a0, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fcvt.d.s $fa0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 @@ -68,7 +68,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill ; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, 
$sp, 12 # 4-byte Folded Reload @@ -81,7 +81,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload @@ -132,12 +132,12 @@ define half @test_fadd_reg(half %a, half %b) nounwind { ; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: move $a0, $a1 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $fp -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -152,12 +152,12 @@ define half @test_fadd_reg(half %a, half %b) nounwind { ; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: move $a0, $a1 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $fp -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload @@ -178,12 +178,12 @@ define void @test_fadd_mem(ptr %p, ptr %q) nounwind { ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: ld.hu $s0, $a0, 0 ; LA32-NEXT: ld.hu $a0, $a1, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $s0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA32-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload @@ -202,12 +202,12 @@ define void @test_fadd_mem(ptr %p, ptr %q) nounwind { ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: ld.hu $s0, $a0, 0 ; LA64-NEXT: ld.hu $a0, $a1, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $s0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload @@ -231,12 +231,12 @@ define half @test_fmul_reg(half %a, half %b) nounwind { ; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: move $a0, $a1 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $fp -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: fld.d 
$fs0, $sp, 0 # 8-byte Folded Reload ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -251,12 +251,12 @@ define half @test_fmul_reg(half %a, half %b) nounwind { ; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: move $a0, $a1 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $fp -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload @@ -277,12 +277,12 @@ define void @test_fmul_mem(ptr %p, ptr %q) nounwind { ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: ld.hu $s0, $a0, 0 ; LA32-NEXT: ld.hu $a0, $a1, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $s0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA32-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload @@ -301,12 +301,12 @@ define void @test_fmul_mem(ptr %p, ptr %q) nounwind { ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: ld.hu $s0, $a0, 0 ; LA64-NEXT: ld.hu $a0, $a1, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $s0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload @@ -327,10 +327,10 @@ define half @freeze_half_undef() nounwind { ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: movgr2fr.w $fa0, $zero -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -340,10 +340,10 @@ define half @freeze_half_undef() nounwind { ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: movgr2fr.w $fa0, $zero -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 ; LA64-NEXT: ret @@ -357,9 +357,9 @@ define half @freeze_half_poison(half %maybe.poison) nounwind { ; LA32: # %bb.0: ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: ld.w 
$ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -368,9 +368,9 @@ define half @freeze_half_poison(half %maybe.poison) nounwind { ; LA64: # %bb.0: ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 ; LA64-NEXT: ret @@ -384,7 +384,7 @@ define signext i32 @test_half_to_s32(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: ftintrz.w.s $fa0, $fa0 ; LA32-NEXT: movfr2gr.s $a0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -395,7 +395,7 @@ define signext i32 @test_half_to_s32(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.w.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.s $a0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload @@ -411,7 +411,7 @@ define zeroext i32 @test_half_to_s32_u32(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: ftintrz.w.s $fa0, $fa0 ; LA32-NEXT: movfr2gr.s $a0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -422,7 +422,7 @@ define zeroext i32 @test_half_to_s32_u32(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.w.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.s $a0, $fa0 ; LA64-NEXT: bstrpick.d $a0, $a0, 31, 0 @@ -439,7 +439,7 @@ define i64 @test_half_to_i64(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: bl %plt(__fixsfdi) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 @@ -449,7 +449,7 @@ define i64 @test_half_to_i64(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.l.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/fp16-promote.ll b/llvm/test/CodeGen/Mips/fp16-promote.ll index 47bace9f5c03f..c03ca3a6d78dd 100644 --- a/llvm/test/CodeGen/Mips/fp16-promote.ll +++ b/llvm/test/CodeGen/Mips/fp16-promote.ll @@ -11,12 +11,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $16, $4 ; MIPS32-NEXT: lhu $4, 0($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 
; MIPS32-NEXT: add.s $f12, $f0, $f20 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload @@ -33,12 +33,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; MIPS64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $4 ; MIPS64-NEXT: lhu $4, 0($5) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f0, $f24 ; MIPS64-NEXT: sh $2, 0($16) ; MIPS64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload @@ -59,7 +59,7 @@ define float @test_fpext_float(ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: lhu $4, 0($4) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: jr $ra @@ -70,7 +70,7 @@ define float @test_fpext_float(ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: lhu $4, 0($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload ; MIPS64-NEXT: jr $ra @@ -86,7 +86,7 @@ define double @test_fpext_double(ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: lhu $4, 0($4) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -98,7 +98,7 @@ define double @test_fpext_double(ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: lhu $4, 0($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload @@ -115,7 +115,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload @@ -128,7 +128,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: move $16, $5 ; MIPS64-NEXT: sh $2, 0($16) ; MIPS64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload @@ -180,18 +180,18 @@ define <4 x float> @test_vec_fpext_float(ptr %p) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $17, $4 ; MIPS32-NEXT: lhu $4, 6($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: lhu $4, 4($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: swc1 $f0, 12($17) ; MIPS32-NEXT: swc1 $f0, 8($17) ; MIPS32-NEXT: lhu $4, 2($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: swc1 $f0, 4($17) ; MIPS32-NEXT: lhu $4, 0($16) 
-; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: swc1 $f0, 0($17) ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload @@ -209,21 +209,21 @@ define <4 x float> @test_vec_fpext_float(ptr %p) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $4 ; MIPS64-NEXT: lhu $4, 2($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: lhu $4, 6($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mfc1 $17, $f0 ; MIPS64-NEXT: mfc1 $18, $f0 ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: dsll $17, $17, 32 ; MIPS64-NEXT: mfc1 $1, $f0 ; MIPS64-NEXT: dsll $1, $1, 32 ; MIPS64-NEXT: dsrl $1, $1, 32 ; MIPS64-NEXT: or $17, $1, $17 ; MIPS64-NEXT: lhu $4, 4($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: dsll $18, $18, 32 ; MIPS64-NEXT: mfc1 $1, $f0 ; MIPS64-NEXT: dsll $1, $1, 32 @@ -251,21 +251,21 @@ define <4 x double> @test_vec_fpext_double(ptr %p) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $17, $4 ; MIPS32-NEXT: lhu $4, 6($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: lhu $4, 4($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: mov.s $f20, $f0 ; MIPS32-NEXT: lhu $4, 2($16) ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: cvt.d.s $f2, $f20 ; MIPS32-NEXT: sdc1 $f2, 24($17) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: sdc1 $f0, 16($17) ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: sdc1 $f0, 8($17) ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: sdc1 $f0, 0($17) @@ -285,21 +285,21 @@ define <4 x double> @test_vec_fpext_double(ptr %p) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $17, $4 ; MIPS64-NEXT: lhu $4, 6($5) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: move $16, $5 ; MIPS64-NEXT: lhu $4, 4($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mov.s $f24, $f0 ; MIPS64-NEXT: lhu $4, 2($16) ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: cvt.d.s $f1, $f24 ; MIPS64-NEXT: sdc1 $f1, 24($17) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sdc1 $f0, 16($17) ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: sdc1 $f0, 8($17) ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: sdc1 $f0, 0($17) @@ -326,18 +326,18 @@ define void @test_vec_fptrunc_float(<4 x float> %a, ptr %p) nounwind { ; MIPS32-NEXT: move $16, $7 ; MIPS32-NEXT: move $17, $5 ; MIPS32-NEXT: move $18, $4 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $6, $f12 ; MIPS32-NEXT: move $19, $2 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $16, $f12 ; MIPS32-NEXT: mtc1 $17, $f12 ; MIPS32-NEXT: lw $16, 56($sp) ; MIPS32-NEXT: sh $2, 6($16) -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: sh $19, 4($16) ; MIPS32-NEXT: sh $2, 2($16) -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 
$18, $f12 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload @@ -360,22 +360,22 @@ define void @test_vec_fptrunc_float(<4 x float> %a, ptr %p) nounwind { ; MIPS64-NEXT: move $17, $5 ; MIPS64-NEXT: move $18, $4 ; MIPS64-NEXT: sll $1, $18, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: move $19, $2 ; MIPS64-NEXT: sll $1, $17, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: dsrl $1, $17, 32 ; MIPS64-NEXT: sll $1, $1, 0 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: sh $2, 4($16) -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: sh $19, 0($16) ; MIPS64-NEXT: sh $2, 6($16) ; MIPS64-NEXT: dsrl $1, $18, 32 ; MIPS64-NEXT: sll $1, $1, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: sh $2, 2($16) ; MIPS64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload @@ -484,19 +484,19 @@ define half @test_fadd_fadd(half %a, half %b, half %c) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $16, $6 ; MIPS32-NEXT: move $17, $4 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $5 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $17 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f0, $f20 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $2 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $16 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f20, $f0 ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $17, 24($sp) # 4-byte Folded Reload @@ -514,19 +514,19 @@ define half @test_fadd_fadd(half %a, half %b, half %c) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $6 ; MIPS64-NEXT: move $17, $4 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $5, 0 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $17, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f0, $f24 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $2, 0 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $16, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f24, $f0 ; MIPS64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload ; MIPS64-NEXT: ld $17, 8($sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/ldexp.ll b/llvm/test/CodeGen/Mips/ldexp.ll index 3753fd567a3ed..4debc6ddce4aa 100644 --- a/llvm/test/CodeGen/Mips/ldexp.ll +++ b/llvm/test/CodeGen/Mips/ldexp.ll @@ -128,12 +128,12 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; SOFT-NEXT: .cfi_offset 31, -4 ; SOFT-NEXT: .cfi_offset 16, -8 ; SOFT-NEXT: move $16, $5 -; SOFT-NEXT: jal __gnu_h2f_ieee +; SOFT-NEXT: jal __extendhfsf2 ; SOFT-NEXT: andi $4, $4, 65535 ; SOFT-NEXT: move $4, $2 ; SOFT-NEXT: jal ldexpf ; SOFT-NEXT: move $5, $16 -; SOFT-NEXT: jal __gnu_f2h_ieee +; SOFT-NEXT: jal __truncsfhf2 ; SOFT-NEXT: move 
$4, $2 ; SOFT-NEXT: lw $16, 16($sp) # 4-byte Folded Reload ; SOFT-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll new file mode 100644 index 0000000000000..50dc93325c286 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1 +define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2 +define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128 +define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + + +; CHECK-LABEL: test_tcgen05_cp_128x128b +define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1; +; CHECK-NEXT: 
ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x256b +define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b +define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b6x16_p32 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32 +define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32 +define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32 +define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32( +; 
CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32 +define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32 +define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32 +define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b4x16_p64 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64 +define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64( +; CHECK: { +; 
CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64 +define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64 +define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64 +define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64 +define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 
%r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64 +define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll new file mode 100644 index 0000000000000..13a45b9d86dcf --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) +declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +; CHECK-LABEL: test_tcgen05_shift +define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) { +; CHECK-LABEL: test_tcgen05_shift( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_shift_param_0]; +; CHECK-NEXT: tcgen05.shift.cta_group::1.down [%r1]; +; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1]; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + call void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/adde_return_type.ll b/llvm/test/CodeGen/PowerPC/adde_return_type.ll index 47c5efc35afc6..7ce11079a6267 100644 --- a/llvm/test/CodeGen/PowerPC/adde_return_type.ll +++ b/llvm/test/CodeGen/PowerPC/adde_return_type.ll @@ -3,7 +3,7 @@ ; RUN: < %s -o /dev/null 2>&1 | FileCheck %s define i64 @testAddeReturnType(i64 %X, i64 %Z) { -; CHECK: Legally typed node: {{.*}}: i64,i1 = uaddo {{.*}} +; CHECK: Legally typed node: {{.*}}: i64,glue = adde {{.*}} %cmp = icmp ne i64 %Z, 0 %conv1 = zext i1 %cmp to i64 %add = add nsw i64 %conv1, %X diff --git a/llvm/test/CodeGen/PowerPC/addegluecrash.ll b/llvm/test/CodeGen/PowerPC/addegluecrash.ll index 7cd94c0e4c2d5..a711b09b9bdfd 100644 --- a/llvm/test/CodeGen/PowerPC/addegluecrash.ll +++ b/llvm/test/CodeGen/PowerPC/addegluecrash.ll @@ -9,20 +9,20 @@ define void @bn_mul_comba8(ptr 
nocapture %r, ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: std 4, -8(1) # 8-byte Folded Spill ; CHECK-NEXT: mr 4, 3 ; CHECK-NEXT: ld 3, -8(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 6, 0(3) -; CHECK-NEXT: ld 11, 0(5) -; CHECK-NEXT: mulhdu 8, 11, 6 +; CHECK-NEXT: ld 9, 0(3) +; CHECK-NEXT: ld 8, 0(5) +; CHECK-NEXT: mulhdu 7, 8, 9 ; CHECK-NEXT: ld 3, 8(3) -; CHECK-NEXT: mulld 7, 3, 6 -; CHECK-NEXT: addc 9, 7, 8 -; CHECK-NEXT: ld 10, 8(5) -; CHECK-NEXT: mulhdu 5, 10, 11 -; CHECK-NEXT: mulld 10, 10, 11 -; CHECK-NEXT: addc 9, 9, 10 +; CHECK-NEXT: mulld 6, 3, 9 +; CHECK-NEXT: mulhdu 3, 3, 9 +; CHECK-NEXT: addc 6, 6, 7 +; CHECK-NEXT: addze 3, 3 +; CHECK-NEXT: ld 5, 8(5) +; CHECK-NEXT: mulld 7, 5, 8 +; CHECK-NEXT: mulhdu 5, 5, 8 +; CHECK-NEXT: addc 6, 6, 7 ; CHECK-NEXT: addze 5, 5 -; CHECK-NEXT: addc 7, 7, 8 -; CHECK-NEXT: mulhdu 3, 3, 6 -; CHECK-NEXT: adde 3, 5, 3 +; CHECK-NEXT: add 3, 5, 3 ; CHECK-NEXT: cmpld 3, 5 ; CHECK-NEXT: crmove 20, 0 ; CHECK-NEXT: li 5, 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll index aead5762d0921..501227c9072c4 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll @@ -1103,13 +1103,13 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0) ; 32BIT-NEXT: renamable $r12 = LWZ 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4) ; 32BIT-NEXT: renamable $r0 = LBZ 3, %fixed-stack.1 :: (load (s8) from %fixed-stack.1 + 3, basealign 4) - ; 32BIT-NEXT: renamable $r31 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) - ; 32BIT-NEXT: renamable $r30 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) + ; 32BIT-NEXT: renamable $r31 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) + ; 32BIT-NEXT: renamable $r30 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) ; 32BIT-NEXT: renamable $r29 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5, align 8) ; 32BIT-NEXT: renamable $r28 = LBZ 3, %fixed-stack.6 :: (load (s8) from %fixed-stack.6 + 3, basealign 4) ; 32BIT-NEXT: renamable $r27 = LHA 2, %fixed-stack.7 :: (load (s16) from %fixed-stack.7 + 2, basealign 4) - ; 32BIT-NEXT: renamable $r26 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) - ; 32BIT-NEXT: renamable $r25 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r26 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r25 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6 @@ -1120,8 +1120,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10 ; 32BIT-NEXT: renamable $r6 = SRAWI renamable $r3, 31, implicit-def dead $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r25, implicit-def $carry - ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r26, implicit-def dead 
$carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r26, implicit-def $carry + ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r25, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r7 = SRAWI renamable $r27, 31, implicit-def dead $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r27, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r7, implicit-def dead $carry, implicit $carry @@ -1131,8 +1131,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r6 = ADDZE killed renamable $r6, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r12, implicit-def $carry ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r6, killed renamable $r4, implicit-def dead $carry, implicit $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r30, implicit-def $carry - ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r31, implicit-def dead $carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r31, implicit-def $carry + ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r30, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r0, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDZE killed renamable $r4, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r4 = ADDC killed renamable $r3, killed renamable $r11, implicit-def $carry diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 8f33f5ef863e6..79c59e925302a 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1213,14 +1213,14 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; ASM32PWR4-NEXT: addc 3, 3, 6 ; ASM32PWR4-NEXT: addze 6, 7 ; ASM32PWR4-NEXT: addc 3, 3, 9 -; ASM32PWR4-NEXT: lwz 7, 84(1) +; ASM32PWR4-NEXT: lwz 5, 84(1) ; ASM32PWR4-NEXT: addze 6, 6 ; ASM32PWR4-NEXT: addc 3, 3, 31 -; ASM32PWR4-NEXT: lwz 5, 80(1) +; ASM32PWR4-NEXT: lwz 7, 80(1) ; ASM32PWR4-NEXT: adde 6, 6, 30 -; ASM32PWR4-NEXT: addc 3, 3, 7 +; ASM32PWR4-NEXT: addc 3, 3, 5 ; ASM32PWR4-NEXT: lbz 8, 91(1) -; ASM32PWR4-NEXT: adde 5, 6, 5 +; ASM32PWR4-NEXT: adde 5, 6, 7 ; ASM32PWR4-NEXT: addc 3, 3, 8 ; ASM32PWR4-NEXT: lbz 6, 103(1) ; ASM32PWR4-NEXT: addze 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll index 9b1893b111556..f1bf7c262317d 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll @@ -36,17 +36,17 @@ entry: ; CHECK32: bb.0.entry: ; CHECK32-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 -; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 80, %fixed-stack.0 +; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 84, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4 -; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 84, %fixed-stack.0 +; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 80, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8 ; 
CHECK32-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12 ; CHECK32-DAG: STW renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16 ; CHECK32-DAG: STW renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20 ; CHECK32-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24 -; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG2]], implicit-def $carry -; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG1]], implicit-def dead $carry, implicit killed $carry +; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG1]], implicit-def $carry +; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG2]], implicit-def dead $carry, implicit killed $carry ; CHECK32 STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28 ; CHECK32: BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll index 5f471ce83828a..53a7cb0aad9ee 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll @@ -325,12 +325,12 @@ define i64 @loadsTGInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 0(3) -; SMALL32-NEXT: lwz 3, 4(3) +; SMALL32-NEXT: lwz 5, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) +; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 4, 6, 3 -; SMALL32-NEXT: adde 3, 7, 5 +; SMALL32-NEXT: addc 4, 6, 5 +; SMALL32-NEXT: adde 3, 7, 3 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -346,14 +346,14 @@ define i64 @loadsTGInit() #1 { ; LARGE32-NEXT: lwz 3, L..C0@l(3) ; LARGE32-NEXT: lwz 4, L..C1@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 5, 0(3) -; LARGE32-NEXT: lwz 3, 4(3) -; LARGE32-NEXT: addis 4, L..C7@u(2) -; LARGE32-NEXT: lwz 4, L..C7@l(4) -; LARGE32-NEXT: lwz 6, 4(4) -; LARGE32-NEXT: lwz 7, 0(4) -; LARGE32-NEXT: addc 4, 6, 3 -; LARGE32-NEXT: adde 3, 7, 5 +; LARGE32-NEXT: lwz 4, 4(3) +; LARGE32-NEXT: lwz 3, 0(3) +; LARGE32-NEXT: addis 5, L..C7@u(2) +; LARGE32-NEXT: lwz 5, L..C7@l(5) +; LARGE32-NEXT: lwz 6, 4(5) +; LARGE32-NEXT: lwz 5, 0(5) +; LARGE32-NEXT: addc 4, 6, 4 +; LARGE32-NEXT: adde 3, 5, 3 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 @@ -589,12 +589,12 @@ define i64 @loadsTWInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 0(3) -; SMALL32-NEXT: lwz 3, 4(3) +; SMALL32-NEXT: lwz 5, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) +; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 4, 6, 3 -; SMALL32-NEXT: adde 3, 7, 5 +; SMALL32-NEXT: addc 4, 6, 5 +; SMALL32-NEXT: adde 3, 7, 3 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -610,14 +610,14 @@ define i64 @loadsTWInit() #1 { ; LARGE32-NEXT: lwz 3, L..C5@l(3) ; LARGE32-NEXT: lwz 4, L..C6@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 5, 0(3) -; LARGE32-NEXT: lwz 3, 4(3) -; LARGE32-NEXT: addis 4, L..C7@u(2) -; LARGE32-NEXT: lwz 4, L..C7@l(4) -; LARGE32-NEXT: lwz 6, 4(4) -; LARGE32-NEXT: lwz 7, 0(4) -; LARGE32-NEXT: addc 4, 6, 3 -; LARGE32-NEXT: adde 3, 7, 5 +; LARGE32-NEXT: lwz 4, 4(3) +; 
LARGE32-NEXT: lwz 3, 0(3) +; LARGE32-NEXT: addis 5, L..C7@u(2) +; LARGE32-NEXT: lwz 5, L..C7@l(5) +; LARGE32-NEXT: lwz 6, 4(5) +; LARGE32-NEXT: lwz 5, 0(5) +; LARGE32-NEXT: addc 4, 6, 4 +; LARGE32-NEXT: adde 3, 5, 3 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll index 533c866eb4e12..c2d7325107a84 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll @@ -304,15 +304,15 @@ define i64 @loadITLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C0(r2) # target-flags(ppc-tprel) @IThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -327,14 +327,14 @@ define i64 @loadITLUninit2() { ; LARGE32-NEXT: lwz r4, L..C0@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -424,15 +424,15 @@ define i64 @loadITLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C1(r2) # target-flags(ppc-tprel) @IThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -447,14 +447,14 @@ define i64 @loadITLInit2() { ; LARGE32-NEXT: lwz r4, L..C1@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; 
LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -544,15 +544,15 @@ define i64 @loadTLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C2(r2) # target-flags(ppc-tprel) @ThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -567,14 +567,14 @@ define i64 @loadTLUninit2() { ; LARGE32-NEXT: lwz r4, L..C2@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -664,15 +664,15 @@ define i64 @loadTLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C3(r2) # target-flags(ppc-tprel) @ThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -687,14 +687,14 @@ define i64 @loadTLInit2() { ; LARGE32-NEXT: lwz r4, L..C3@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll 
b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll index 268402170063e..6c0ea782c2a38 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll @@ -290,16 +290,16 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} bla 0 ; DIS-NEXT: {{0*}}[[#ADDR]]: R_RBA (idx: [[#NFA+1]]) .__get_tpointer[PR] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 4, 2, 0 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 5, 2, 0 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 8(4) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 8(5) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 0(4) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(4) -; DIS-NEXT: addc 4, 4, 3 -; DIS-NEXT: adde 3, 6, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 4(5) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(5) +; DIS-NEXT: addc 4, 6, 4 +; DIS-NEXT: adde 3, 5, 3 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: lwz 0, 8(1) ; DIS-NEXT: mtlr 0 @@ -324,10 +324,10 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 12(4) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+27]]) IThreadLocalVarUninit2[TE] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) -; DIS-NEXT: addic 4, 3, 1 -; DIS-NEXT: addze 3, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) +; DIS-NEXT: addic 4, 4, 1 +; DIS-NEXT: addze 3, 3 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: lwz 0, 8(1) ; DIS-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll index 4f00cff83942a..0ff2f28207ed4 100644 --- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll +++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll @@ -357,10 +357,10 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: .LBB7_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB7_4 Depth 2 -; CHECK-NEXT: subc 5, 6, 4 +; CHECK-NEXT: sub 5, 6, 4 +; CHECK-NEXT: cmpld 5, 6 ; CHECK-NEXT: li 7, 0 -; CHECK-NEXT: addze. 
8, 7 -; CHECK-NEXT: beq 0, .LBB7_4 +; CHECK-NEXT: bgt 0, .LBB7_4 ; CHECK-NEXT: # %bb.3: # %atomicrmw.start ; CHECK-NEXT: # ; CHECK-NEXT: mr 7, 5 diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index 07bdbb25a746a..24e71c87414e8 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -476,7 +476,7 @@ define half @load_atomic_f16__seq_cst(ptr %ptr) { ; PPC32-NEXT: cmpw cr7, r3, r3 ; PPC32-NEXT: bne- cr7, .+4 ; PPC32-NEXT: isync -; PPC32-NEXT: bl __gnu_h2f_ieee +; PPC32-NEXT: bl __extendhfsf2 ; PPC32-NEXT: lwz r0, 20(r1) ; PPC32-NEXT: addi r1, r1, 16 ; PPC32-NEXT: mtlr r0 @@ -494,7 +494,7 @@ define half @load_atomic_f16__seq_cst(ptr %ptr) { ; PPC64-NEXT: cmpd cr7, r3, r3 ; PPC64-NEXT: bne- cr7, .+4 ; PPC64-NEXT: isync -; PPC64-NEXT: bl __gnu_h2f_ieee +; PPC64-NEXT: bl __extendhfsf2 ; PPC64-NEXT: nop ; PPC64-NEXT: addi r1, r1, 112 ; PPC64-NEXT: ld r0, 16(r1) @@ -582,7 +582,7 @@ define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; PPC32-NEXT: .cfi_offset r30, -8 ; PPC32-NEXT: stw r30, 8(r1) # 4-byte Folded Spill ; PPC32-NEXT: mr r30, r3 -; PPC32-NEXT: bl __gnu_f2h_ieee +; PPC32-NEXT: bl __truncsfhf2 ; PPC32-NEXT: sync ; PPC32-NEXT: sth r3, 0(r30) ; PPC32-NEXT: lwz r30, 8(r1) # 4-byte Folded Reload @@ -601,7 +601,7 @@ define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; PPC64-NEXT: .cfi_offset r30, -16 ; PPC64-NEXT: std r30, 112(r1) # 8-byte Folded Spill ; PPC64-NEXT: mr r30, r3 -; PPC64-NEXT: bl __gnu_f2h_ieee +; PPC64-NEXT: bl __truncsfhf2 ; PPC64-NEXT: nop ; PPC64-NEXT: sync ; PPC64-NEXT: sth r3, 0(r30) diff --git a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll index 29e7a16739864..34091ba46c3f6 100644 --- a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll +++ b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll @@ -12,11 +12,11 @@ define double @postinctodbl(ptr nocapture %llp) #0 { ; CHECK-NEXT: addic 4, 4, 1 ; CHECK-NEXT: lwz 5, 0(3) ; CHECK-NEXT: stw 5, 8(1) +; CHECK-NEXT: addze 5, 5 ; CHECK-NEXT: lfd 0, 8(1) -; CHECK-NEXT: stw 4, 4(3) -; CHECK-NEXT: addze 4, 5 +; CHECK-NEXT: stw 5, 0(3) ; CHECK-NEXT: fcfid 1, 0 -; CHECK-NEXT: stw 4, 0(3) +; CHECK-NEXT: stw 4, 4(3) ; CHECK-NEXT: addi 1, 1, 16 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll index 4256933300243..50f05cca80458 100644 --- a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll +++ b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll @@ -17,7 +17,7 @@ define dso_local double @loadd(ptr nocapture readonly %a) local_unnamed_addr #0 ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 2(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -37,7 +37,7 @@ define dso_local double @loadd(ptr nocapture readonly %a) local_unnamed_addr #0 ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 2(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -61,7 +61,7 @@ define dso_local float @loadf(ptr nocapture readonly %a) local_unnamed_addr #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 2(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -81,7 +81,7 @@ define dso_local float @loadf(ptr nocapture 
readonly %a) local_unnamed_addr #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 2(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) @@ -130,9 +130,9 @@ define dso_local void @stored(ptr nocapture %a, double %b) local_unnamed_addr #0 ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -156,7 +156,7 @@ define dso_local void @storef(ptr nocapture %a, float %b) local_unnamed_addr #0 ; P8-NEXT: stdu r1, -48(r1) ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r3 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -179,12 +179,12 @@ define dso_local void @storef(ptr nocapture %a, float %b) local_unnamed_addr #0 ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: clrldi r3, r4, 32 ; SOFT-NEXT: std r0, 64(r1) -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -220,9 +220,9 @@ define void @test_load_store(ptr %in, ptr %out) #0 { ; SOFT-NEXT: std r0, 64(r1) ; SOFT-NEXT: mr r30, r4 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -279,7 +279,7 @@ define float @test_extend32(ptr %addr) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -298,7 +298,7 @@ define float @test_extend32(ptr %addr) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) @@ -315,7 +315,7 @@ define double @test_extend64(ptr %addr) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -334,7 +334,7 @@ define double @test_extend64(ptr %addr) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -354,7 +354,7 @@ define void @test_trunc32(float %in, ptr %addr) #0 { ; P8-NEXT: stdu r1, -48(r1) ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -377,12 +377,12 @@ define void @test_trunc32(float %in, ptr %addr) #0 { ; SOFT-NEXT: clrldi r3, r3, 32 ; SOFT-NEXT: std r0, 64(r1) ; SOFT-NEXT: mr r30, r4 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: 
nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -427,9 +427,9 @@ define void @test_trunc64(double %in, ptr %addr) #0 { ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -448,7 +448,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xscvdpsxds f0, f1 ; P8-NEXT: mffprd r3, f0 @@ -472,7 +472,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __fixsfdi ; SOFT-NEXT: nop @@ -494,7 +494,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 { ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 ; P8-NEXT: xscvsxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -522,12 +522,12 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 { ; SOFT-NEXT: bl __floatdisf ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -546,7 +546,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xscvdpuxds f0, f1 ; P8-NEXT: mffprd r3, f0 @@ -570,7 +570,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __fixunssfdi ; SOFT-NEXT: nop @@ -592,7 +592,7 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 { ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 ; P8-NEXT: xscvuxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -619,12 +619,12 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 { ; SOFT-NEXT: mr r30, r4 ; SOFT-NEXT: bl __floatundisf ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -651,19 +651,19 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 { ; P8-NEXT: stxvd2x vs62, r1, r4 # 16-byte Folded Spill ; P8-NEXT: li r4, 80 ; P8-NEXT: stxvd2x vs63, r1, r4 # 16-byte Folded Spill -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 2(r30) ; P8-NEXT: xxlor vs63, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 4(r30) ; P8-NEXT: xxlor vs62, f1, f1 -; P8-NEXT: bl 
__gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 0(r30) ; P8-NEXT: xxlor vs61, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: li r3, 80 ; P8-NEXT: xxmrghd vs0, vs61, vs1 @@ -714,19 +714,19 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 { ; SOFT-NEXT: std r0, 96(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: lhz r3, 2(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: lhz r3, 4(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: lhz r3, 6(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r6, r3 ; SOFT-NEXT: mr r3, r29 @@ -759,19 +759,19 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 { ; P8-NEXT: stxvd2x vs62, r1, r4 # 16-byte Folded Spill ; P8-NEXT: li r4, 80 ; P8-NEXT: stxvd2x vs63, r1, r4 # 16-byte Folded Spill -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 4(r30) ; P8-NEXT: xxlor vs63, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 2(r30) ; P8-NEXT: xxlor vs62, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 0(r30) ; P8-NEXT: xxlor vs61, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: li r3, 80 ; P8-NEXT: xxmrghd vs35, vs63, vs62 @@ -816,25 +816,25 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 { ; SOFT-NEXT: std r0, 96(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: lhz r3, 2(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: lhz r3, 4(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: lhz r3, 6(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -870,21 +870,21 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; P8-NEXT: stxvd2x vs63, r1, r3 # 16-byte Folded Spill ; P8-NEXT: mr r30, r5 ; P8-NEXT: vmr v31, v2 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xxswapd vs0, vs63 ; P8-NEXT: mr r29, r3 ; P8-NEXT: xscvspdpn f1, vs0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xxsldwi vs0, vs63, vs63, 1 ; P8-NEXT: mr r28, r3 ; P8-NEXT: xscvspdpn f1, vs0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xscvspdpn f1, vs63 ; P8-NEXT: mr r27, r3 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 6(r30) ; P8-NEXT: li r3, 48 @@ -939,48 +939,48 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; SOFT-NEXT: mr r30, r7 ; SOFT-NEXT: mr r29, r5 ; SOFT-NEXT: mr r28, r4 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r26, r3 ; SOFT-NEXT: clrldi r3, r29, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl 
__truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r28, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r27, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: clrldi r3, r28, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r29, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r26, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 6(r30) ; SOFT-NEXT: mr r3, r29 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 4(r30) ; SOFT-NEXT: mr r3, r28 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 2(r30) ; SOFT-NEXT: mr r3, r27 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 80 @@ -1093,33 +1093,33 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 { ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: clrldi r3, r28, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r29, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r26, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 6(r30) ; SOFT-NEXT: mr r3, r29 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 4(r30) ; SOFT-NEXT: mr r3, r28 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 2(r30) ; SOFT-NEXT: mr r3, r27 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 80 @@ -1145,15 +1145,15 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; P8-NEXT: std r0, 80(r1) ; P8-NEXT: mr r30, r3 ; P8-NEXT: lhz r3, 0(r4) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: mtfprwa f0, r30 ; P8-NEXT: fmr f31, f1 ; P8-NEXT: xscvsxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: clrldi r3, r3, 48 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xsaddsp f1, f31, f1 ; P8-NEXT: addi r1, r1, 64 @@ -1187,17 +1187,17 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; SOFT-NEXT: std r0, 80(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r4) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: extsw r3, r30 ; SOFT-NEXT: bl __floatsisf ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; 
SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r4, r3 ; SOFT-NEXT: mr r3, r29 @@ -1221,10 +1221,10 @@ define half @PR40273(half) #0 { ; P8-NEXT: mflr r0 ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: clrldi r3, r3, 48 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: fmr f0, f1 ; P8-NEXT: xxlxor f1, f1, f1 @@ -1260,7 +1260,7 @@ define half @PR40273(half) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: clrldi r3, r3, 48 ; SOFT-NEXT: std r0, 48(r1) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: li r4, 0 ; SOFT-NEXT: bl __nesf2 @@ -1268,7 +1268,7 @@ define half @PR40273(half) #0 { ; SOFT-NEXT: cmplwi r3, 0 ; SOFT-NEXT: lis r3, 16256 ; SOFT-NEXT: iseleq r3, 0, r3 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/inc-of-add.ll b/llvm/test/CodeGen/PowerPC/inc-of-add.ll index 432b5a6b362fe..98b812e7845a5 100644 --- a/llvm/test/CodeGen/PowerPC/inc-of-add.ll +++ b/llvm/test/CodeGen/PowerPC/inc-of-add.ll @@ -412,8 +412,8 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; PPC32-NEXT: not 4, 4 ; PPC32-NEXT: not 3, 3 ; PPC32-NEXT: subc 4, 8, 4 -; PPC32-NEXT: subfe 3, 3, 7 ; PPC32-NEXT: not 6, 6 +; PPC32-NEXT: subfe 3, 3, 7 ; PPC32-NEXT: not 5, 5 ; PPC32-NEXT: subc 6, 10, 6 ; PPC32-NEXT: subfe 5, 5, 9 diff --git a/llvm/test/CodeGen/PowerPC/pr35688.ll b/llvm/test/CodeGen/PowerPC/pr35688.ll index 5746934802eb2..8a4351b229fd1 100644 --- a/llvm/test/CodeGen/PowerPC/pr35688.ll +++ b/llvm/test/CodeGen/PowerPC/pr35688.ll @@ -8,9 +8,10 @@ define void @ec_GFp_nistp256_points_mul() { ; CHECK-LABEL: ec_GFp_nistp256_points_mul: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: ld 3, 0(3) -; CHECK-NEXT: subfic 4, 3, 0 ; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: subfic 5, 3, 0 ; CHECK-NEXT: subfze 5, 4 +; CHECK-NEXT: sradi 5, 5, 63 ; CHECK-NEXT: subc 3, 5, 3 ; CHECK-NEXT: subfe 3, 4, 5 ; CHECK-NEXT: sradi 3, 3, 63 diff --git a/llvm/test/CodeGen/PowerPC/pr36292.ll b/llvm/test/CodeGen/PowerPC/pr36292.ll index 98d94646bce65..1794b3ba526ed 100644 --- a/llvm/test/CodeGen/PowerPC/pr36292.ll +++ b/llvm/test/CodeGen/PowerPC/pr36292.ll @@ -12,12 +12,11 @@ define void @test() nounwind comdat { ; CHECK-NEXT: std 30, -16(1) # 8-byte Folded Spill ; CHECK-NEXT: stdu 1, -64(1) ; CHECK-NEXT: std 0, 80(1) -; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: ld 3, 0(3) ; CHECK-NEXT: ld 30, 32(1) -; CHECK-NEXT: subc 3, 3, 30 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: iseleq 3, 0, 3 +; CHECK-NEXT: sub 4, 3, 30 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: iselgt 3, 0, 4 ; CHECK-NEXT: addi 29, 3, 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %forcond diff --git a/llvm/test/CodeGen/PowerPC/pr40922.ll b/llvm/test/CodeGen/PowerPC/pr40922.ll index ed840ad12b7ed..9252e9a3e3aa4 100644 --- a/llvm/test/CodeGen/PowerPC/pr40922.ll +++ b/llvm/test/CodeGen/PowerPC/pr40922.ll @@ -23,10 +23,11 @@ define i32 @a() { ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: addic 6, 4, 6 -; CHECK-NEXT: addze. 5, 5 -; CHECK-NEXT: rlwinm 5, 6, 0, 28, 26 -; CHECK-NEXT: cmplw 1, 5, 4 -; CHECK-NEXT: crnand 20, 4, 2 +; CHECK-NEXT: addze 5, 5 +; CHECK-NEXT: rlwinm 6, 6, 0, 28, 26 +; CHECK-NEXT: andi. 
5, 5, 1 +; CHECK-NEXT: cmplw 1, 6, 4 +; CHECK-NEXT: crorc 20, 1, 4 ; CHECK-NEXT: bc 12, 20, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: bl e diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll index 0edbae47e9378..0f2dcb3ccc8a0 100644 --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -22,14 +22,12 @@ define hidden void @julia_tryparse_internal_45896() #0 { ; CHECK-NEXT: li r5, -3 ; CHECK-NEXT: sradi r4, r3, 63 ; CHECK-NEXT: rldic r5, r5, 4, 32 -; CHECK-NEXT: mulld r6, r4, r5 ; CHECK-NEXT: mulhdu r3, r3, r5 -; CHECK-NEXT: mulhdu r4, r4, r5 -; CHECK-NEXT: addc r3, r3, r6 -; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: addze r3, r3 -; CHECK-NEXT: or. r3, r4, r3 -; CHECK-NEXT: beq cr0, .LBB0_9 +; CHECK-NEXT: maddld r6, r4, r5, r3 +; CHECK-NEXT: cmpld cr1, r6, r3 +; CHECK-NEXT: mulhdu. r3, r4, r5 +; CHECK-NEXT: crorc 4*cr5+lt, 4*cr1+lt, eq +; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_9 ; CHECK-NEXT: # %bb.8: # %L917 ; CHECK-NEXT: .LBB0_9: # %L994 top: diff --git a/llvm/test/CodeGen/PowerPC/pr48519.ll b/llvm/test/CodeGen/PowerPC/pr48519.ll index 002dd8f0d167a..fa156454a1313 100644 --- a/llvm/test/CodeGen/PowerPC/pr48519.ll +++ b/llvm/test/CodeGen/PowerPC/pr48519.ll @@ -20,17 +20,17 @@ define void @julia__typed_vcat_20() #0 { ; CHECK-NEXT: addi r3, r3, -1 ; CHECK-NEXT: mtfprd f0, r3 ; CHECK-NEXT: xscvsxdsp f1, f0 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r30, r30, -1 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: cmpldi r30, 0 ; CHECK-NEXT: bc 12, gt, .LBB0_1 ; CHECK-NEXT: # %bb.2: # %bb11 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: sth r3, 0(r3) ; @@ -95,7 +95,7 @@ define void @julia__hypot_17() #0 { ; CHECK-NEXT: # %bb.2: # %bb3 ; CHECK-NEXT: # ; CHECK-NEXT: lhz r3, 0(0) -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fcmpu cr0, f1, f1 ; CHECK-NEXT: bun cr0, .LBB1_1 @@ -169,12 +169,12 @@ define void @func_48786() #0 { ; CHECK-NEXT: # %bb.3: # %bb4 ; CHECK-NEXT: # ; CHECK-NEXT: lhz r3, 0(r3) -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bc 4, 4*cr2+lt, .LBB2_6 ; CHECK-NEXT: # %bb.4: # %bb8 ; CHECK-NEXT: # -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: sth r3, 0(0) ; CHECK-NEXT: b .LBB2_1 @@ -273,7 +273,7 @@ define void @func_48785(half %arg) #0 { ; CHECK-NEXT: .LBB3_1: # %bb1 ; CHECK-NEXT: # ; CHECK-NEXT: fmr f1, f31 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r30, r30, -1 ; CHECK-NEXT: sth r3, 0(r29) diff --git a/llvm/test/CodeGen/PowerPC/pr49092.ll b/llvm/test/CodeGen/PowerPC/pr49092.ll index ea84c77603d08..7b524a6d2f69b 100644 --- a/llvm/test/CodeGen/PowerPC/pr49092.ll +++ b/llvm/test/CodeGen/PowerPC/pr49092.ll @@ -14,7 +14,7 @@ define dso_local half @test2(i64 %a, i64 %b) local_unnamed_addr #0 { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: addi r3, r3, 11 ; CHECK-NEXT: clrlwi r3, r3, 16 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r1, r1, 32 ; CHECK-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll index d9b22bda85e44..8fff2c28da245 100644 --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ 
b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -156,11 +156,10 @@ define i64 @unsigned_sat_constant_i64_using_min(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addic 3, 3, 42 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: addi 4, 3, 42 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, %a @@ -171,11 +170,10 @@ define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addic 3, 3, 42 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: addi 4, 3, 42 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, -43 @@ -348,11 +346,10 @@ define i64 @unsigned_sat_variable_i64_using_min(i64 %x, i64 %y) { define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) { ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: addc 3, 3, 4 -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: add 4, 3, 4 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, %y %c = icmp ugt i64 %x, %a @@ -862,11 +859,9 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { define i64 @unsigned_sat_constant_i64_with_single_use(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_with_single_use: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 4 -; CHECK-NEXT: subc 3, 3, 4 -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addze. 
4, 4 -; CHECK-NEXT: iseleq 3, 0, 3 +; CHECK-NEXT: addi 4, 3, -4 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: iselgt 3, 0, 4 ; CHECK-NEXT: blr %umin = call i64 @llvm.umin.i64(i64 %x, i64 4) %sub = sub i64 %x, %umin diff --git a/llvm/test/CodeGen/PowerPC/select.ll b/llvm/test/CodeGen/PowerPC/select.ll index 10661030da8d8..289f83c475ff3 100644 --- a/llvm/test/CodeGen/PowerPC/select.ll +++ b/llvm/test/CodeGen/PowerPC/select.ll @@ -135,22 +135,18 @@ define i64 @f4_sge_0(i64 %x) { ; ; CHECK-32-LABEL: f4_sge_0: ; CHECK-32: # %bb.0: -; CHECK-32-NEXT: mr r6, r4 +; CHECK-32-NEXT: mr r5, r4 ; CHECK-32-NEXT: subfic r4, r4, 0 +; CHECK-32-NEXT: mr r6, r3 ; CHECK-32-NEXT: cmpwi r3, -1 -; CHECK-32-NEXT: subfze r5, r3 -; CHECK-32-NEXT: ble cr0, .LBB5_3 +; CHECK-32-NEXT: subfze r3, r3 +; CHECK-32-NEXT: bgt cr0, .LBB5_2 ; CHECK-32-NEXT: # %bb.1: -; CHECK-32-NEXT: ble cr0, .LBB5_4 +; CHECK-32-NEXT: mr r3, r6 ; CHECK-32-NEXT: .LBB5_2: -; CHECK-32-NEXT: mr r3, r5 -; CHECK-32-NEXT: blr -; CHECK-32-NEXT: .LBB5_3: -; CHECK-32-NEXT: mr r4, r6 -; CHECK-32-NEXT: bgt cr0, .LBB5_2 -; CHECK-32-NEXT: .LBB5_4: -; CHECK-32-NEXT: mr r5, r3 -; CHECK-32-NEXT: mr r3, r5 +; CHECK-32-NEXT: bgtlr cr0 +; CHECK-32-NEXT: # %bb.3: +; CHECK-32-NEXT: mr r4, r5 ; CHECK-32-NEXT: blr %c = icmp sge i64 %x, 0 %x.neg = sub i64 0, %x diff --git a/llvm/test/CodeGen/PowerPC/uaddo-32.ll b/llvm/test/CodeGen/PowerPC/uaddo-32.ll index 5dd5a2672b166..b5989fc2ee2da 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-32.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-32.ll @@ -1,24 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM -; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s define noundef i32 @add(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 6 -; LINUXASM-NEXT: stw 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: stw 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %0, 1 @@ -31,22 +22,13 @@ entry: declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) define noundef zeroext i1 @add_overflow(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add_overflow: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 4, 3, 4 -; LINUXASM-NEXT: addze 3, 6 -; LINUXASM-NEXT: stw 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add_overflow: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 4, 3, 4 -; AIXASM-NEXT: li 3, 0 -; AIXASM-NEXT: addze 3, 3 -; AIXASM-NEXT: stw 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 
%b) %1 = extractvalue { i32, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/uaddo-64.ll b/llvm/test/CodeGen/PowerPC/uaddo-64.ll index 98e834f29467c..3c7ab2c2bab79 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-64.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-64.ll @@ -1,24 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM -; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM +; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s define noundef i64 @add(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 6 -; LINUXASM-NEXT: std 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: std 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -31,22 +22,13 @@ entry: declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) define noundef zeroext i1 @add_overflow(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add_overflow: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 4, 3, 4 -; LINUXASM-NEXT: addze 3, 6 -; LINUXASM-NEXT: std 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add_overflow: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 4, 3, 4 -; AIXASM-NEXT: li 3, 0 -; AIXASM-NEXT: addze 3, 3 -; AIXASM-NEXT: std 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -56,28 +38,16 @@ entry: } define noundef i64 @addWithCarryIn (i64 noundef %a, i64 noundef %b, i64 noundef %c, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: addWithCarryIn: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 7, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 7 -; LINUXASM-NEXT: addc 3, 3, 5 -; LINUXASM-NEXT: addze 5, 7 -; LINUXASM-NEXT: or 4, 4, 5 -; LINUXASM-NEXT: std 4, 0(6) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .addWithCarryIn: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 7, 4 -; AIXASM-NEXT: addc 3, 3, 5 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: or 4, 7, 4 -; AIXASM-NEXT: std 4, 0(6) -; AIXASM-NEXT: blr - +; CHECK-LABEL: addWithCarryIn: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 7, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 7 +; CHECK-NEXT: addc 3, 3, 5 +; CHECK-NEXT: addze 5, 7 +; CHECK-NEXT: or 4, 4, 5 +; CHECK-NEXT: std 4, 0(6) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll 
b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index f573fdab1b153..84895e74f18d5 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -5,134 +5,137 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC64-LABEL: muloti_test: ; PPC64: # %bb.0: # %start -; PPC64-NEXT: addic 9, 5, -1 -; PPC64-NEXT: mulld 10, 5, 4 -; PPC64-NEXT: mulld 11, 3, 6 -; PPC64-NEXT: subfe 9, 9, 5 -; PPC64-NEXT: add 10, 11, 10 -; PPC64-NEXT: addic 11, 3, -1 -; PPC64-NEXT: mulhdu 8, 3, 6 -; PPC64-NEXT: subfe 3, 11, 3 -; PPC64-NEXT: and 3, 3, 9 -; PPC64-NEXT: addic 9, 8, -1 -; PPC64-NEXT: subfe 8, 9, 8 -; PPC64-NEXT: or 3, 3, 8 -; PPC64-NEXT: mulhdu 5, 5, 4 ; PPC64-NEXT: addic 8, 5, -1 +; PPC64-NEXT: mulhdu 9, 5, 4 +; PPC64-NEXT: mulld 10, 5, 4 ; PPC64-NEXT: subfe 5, 8, 5 -; PPC64-NEXT: li 7, 0 -; PPC64-NEXT: or 5, 3, 5 -; PPC64-NEXT: mulhdu 8, 4, 6 -; PPC64-NEXT: addc 3, 8, 10 -; PPC64-NEXT: addze 7, 7 -; PPC64-NEXT: addic 8, 7, -1 -; PPC64-NEXT: subfe 7, 8, 7 +; PPC64-NEXT: mulld 8, 3, 6 +; PPC64-NEXT: add 8, 8, 10 +; PPC64-NEXT: addic 10, 3, -1 +; PPC64-NEXT: mulhdu 7, 3, 6 +; PPC64-NEXT: subfe 3, 10, 3 +; PPC64-NEXT: and 5, 3, 5 +; PPC64-NEXT: addic 3, 7, -1 +; PPC64-NEXT: subfe 7, 3, 7 +; PPC64-NEXT: or 5, 5, 7 +; PPC64-NEXT: mulhdu 10, 4, 6 +; PPC64-NEXT: addic 7, 9, -1 +; PPC64-NEXT: add 3, 10, 8 +; PPC64-NEXT: subfe 7, 7, 9 +; PPC64-NEXT: or 5, 5, 7 +; PPC64-NEXT: subc 7, 3, 10 +; PPC64-NEXT: subfe 7, 3, 3 +; PPC64-NEXT: neg 7, 7 ; PPC64-NEXT: or 5, 5, 7 ; PPC64-NEXT: mulld 4, 4, 6 ; PPC64-NEXT: blr ; ; PPC32-LABEL: muloti_test: ; PPC32: # %bb.0: # %start -; PPC32-NEXT: stwu 1, -64(1) -; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: mfcr 12 -; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 27, 9, 4 -; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill +; PPC32-NEXT: stwu 1, -80(1) ; PPC32-NEXT: mr 11, 7 -; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: li 7, 0 -; PPC32-NEXT: mullw 26, 3, 10 -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: add 27, 26, 27 -; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 7, 11, 0 -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 24, 11, 6 -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu 0, 8, 6 -; PPC32-NEXT: stw 12, 16(1) -; PPC32-NEXT: mr 12, 5 -; PPC32-NEXT: mulhwu 5, 4, 10 -; PPC32-NEXT: addc 5, 5, 27 -; PPC32-NEXT: addze 27, 7 -; PPC32-NEXT: cmpwi 2, 27, 0 -; PPC32-NEXT: mullw 25, 12, 8 -; PPC32-NEXT: add 26, 24, 25 -; PPC32-NEXT: addc 0, 0, 26 -; PPC32-NEXT: addze 26, 7 -; PPC32-NEXT: mullw 23, 8, 6 -; PPC32-NEXT: mullw 22, 4, 10 -; PPC32-NEXT: addc 24, 22, 23 -; PPC32-NEXT: adde 22, 5, 0 -; PPC32-NEXT: mulhwu 29, 6, 10 -; PPC32-NEXT: mullw 21, 12, 10 -; PPC32-NEXT: addc 5, 21, 29 -; PPC32-NEXT: mulhwu 30, 12, 10 -; PPC32-NEXT: addze 0, 30 -; PPC32-NEXT: mullw 23, 6, 9 -; PPC32-NEXT: addc 5, 23, 5 -; PPC32-NEXT: mulhwu 28, 6, 9 -; PPC32-NEXT: addze 29, 28 -; PPC32-NEXT: addc 0, 0, 29 -; PPC32-NEXT: addze 29, 7 -; PPC32-NEXT: mullw 30, 12, 9 -; PPC32-NEXT: addc 0, 30, 0 -; PPC32-NEXT: mulhwu 25, 12, 9 -; PPC32-NEXT: adde 30, 25, 29 -; PPC32-NEXT: addc 0, 0, 24 -; PPC32-NEXT: adde 30, 30, 22 -; PPC32-NEXT: addze. 29, 7 +; PPC32-NEXT: stw 26, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu. 
26, 11, 6 +; PPC32-NEXT: stw 24, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 27, 60(1) # 4-byte Folded Spill ; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: mulhwu. 29, 11, 6 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: mulhwu. 29, 12, 8 +; PPC32-NEXT: stw 19, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu 27, 6, 10 +; PPC32-NEXT: stw 20, 32(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 6, 11, 0 +; PPC32-NEXT: stw 21, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: li 7, 0 +; PPC32-NEXT: stw 22, 40(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu. 26, 5, 8 +; PPC32-NEXT: stw 23, 44(1) # 4-byte Folded Spill ; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: cmpwi 12, 0 -; PPC32-NEXT: crnor 20, 2, 30 +; PPC32-NEXT: stw 25, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 5, 0 +; PPC32-NEXT: stw 28, 64(1) # 4-byte Folded Spill +; PPC32-NEXT: mullw 24, 5, 10 +; PPC32-NEXT: stw 29, 68(1) # 4-byte Folded Spill +; PPC32-NEXT: crnor 20, 2, 26 +; PPC32-NEXT: stw 30, 72(1) # 4-byte Folded Spill ; PPC32-NEXT: cmpwi 3, 0 -; PPC32-NEXT: cmpwi 7, 9, 0 -; PPC32-NEXT: crnor 24, 30, 2 -; PPC32-NEXT: mulhwu. 12, 3, 10 -; PPC32-NEXT: crorc 20, 20, 26 -; PPC32-NEXT: mcrf 7, 0 +; PPC32-NEXT: stw 12, 24(1) +; PPC32-NEXT: mulhwu 30, 5, 10 +; PPC32-NEXT: cmpwi 6, 9, 0 +; PPC32-NEXT: crnor 21, 26, 2 +; PPC32-NEXT: crorc 20, 20, 6 ; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: cmpwi 26, 0 -; PPC32-NEXT: crorc 28, 20, 2 +; PPC32-NEXT: mulhwu 12, 5, 9 +; PPC32-NEXT: mullw 26, 5, 9 +; PPC32-NEXT: mullw 22, 5, 8 +; PPC32-NEXT: addc 5, 24, 27 +; PPC32-NEXT: addze 30, 30 +; PPC32-NEXT: mullw 23, 6, 9 +; PPC32-NEXT: addc 5, 23, 5 +; PPC32-NEXT: mullw 21, 11, 6 +; PPC32-NEXT: add 27, 21, 22 +; PPC32-NEXT: mulhwu 28, 8, 6 +; PPC32-NEXT: add 27, 28, 27 +; PPC32-NEXT: cmplw 7, 27, 28 +; PPC32-NEXT: mulhwu. 23, 3, 10 +; PPC32-NEXT: mcrf 6, 0 +; PPC32-NEXT: cror 24, 20, 28 +; PPC32-NEXT: crorc 25, 21, 26 +; PPC32-NEXT: mulhwu 0, 6, 9 +; PPC32-NEXT: mullw 20, 9, 4 ; PPC32-NEXT: mulhwu. 9, 9, 4 -; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: crorc 20, 24, 30 +; PPC32-NEXT: mcrf 1, 0 +; PPC32-NEXT: addze 9, 0 +; PPC32-NEXT: mullw 19, 3, 10 ; PPC32-NEXT: or. 3, 4, 3 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: or. 3, 8, 11 -; PPC32-NEXT: crorc 20, 20, 10 -; PPC32-NEXT: crnor 21, 2, 26 +; PPC32-NEXT: mcrf 5, 0 +; PPC32-NEXT: addc 3, 30, 9 +; PPC32-NEXT: add 24, 19, 20 +; PPC32-NEXT: mulhwu 29, 4, 10 +; PPC32-NEXT: add 28, 29, 24 +; PPC32-NEXT: cmplw 2, 28, 29 +; PPC32-NEXT: crorc 20, 25, 6 +; PPC32-NEXT: cror 20, 20, 8 +; PPC32-NEXT: mullw 22, 4, 10 +; PPC32-NEXT: or. 
4, 8, 11 +; PPC32-NEXT: addze 4, 7 +; PPC32-NEXT: crnor 21, 2, 22 ; PPC32-NEXT: cror 20, 21, 20 -; PPC32-NEXT: cror 20, 20, 28 -; PPC32-NEXT: crandc 20, 6, 20 +; PPC32-NEXT: mullw 25, 8, 6 +; PPC32-NEXT: addc 8, 26, 3 +; PPC32-NEXT: adde 9, 12, 4 +; PPC32-NEXT: addc 3, 22, 25 +; PPC32-NEXT: adde 11, 28, 27 +; PPC32-NEXT: addc 4, 8, 3 +; PPC32-NEXT: adde 3, 9, 11 +; PPC32-NEXT: cmplw 1, 3, 9 +; PPC32-NEXT: cmplw 4, 8 +; PPC32-NEXT: crandc 22, 4, 6 ; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: bc 12, 20, .LBB0_2 +; PPC32-NEXT: bc 12, 22, .LBB0_3 ; PPC32-NEXT: # %bb.1: # %start +; PPC32-NEXT: crand 21, 6, 0 +; PPC32-NEXT: bc 12, 21, .LBB0_3 +; PPC32-NEXT: # %bb.2: # %start +; PPC32-NEXT: cror 20, 20, 24 +; PPC32-NEXT: bc 4, 20, .LBB0_4 +; PPC32-NEXT: .LBB0_3: # %start ; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: .LBB0_2: # %start -; PPC32-NEXT: lwz 12, 16(1) -; PPC32-NEXT: mr 3, 30 -; PPC32-NEXT: mr 4, 0 -; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: .LBB0_4: # %start +; PPC32-NEXT: lwz 12, 24(1) +; PPC32-NEXT: lwz 30, 72(1) # 4-byte Folded Reload ; PPC32-NEXT: mtcrf 32, 12 # cr2 -; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 28, 48(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 27, 44(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 26, 40(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 25, 36(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 24, 32(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; PPC32-NEXT: addi 1, 1, 64 +; PPC32-NEXT: lwz 29, 68(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 28, 64(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 27, 60(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 26, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 25, 52(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 24, 48(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 23, 44(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 22, 40(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 21, 36(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 20, 32(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 19, 28(1) # 4-byte Folded Reload +; PPC32-NEXT: addi 1, 1, 80 ; PPC32-NEXT: blr start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll index 515dd0f70e948..e5c5356ce50a4 100644 --- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll @@ -207,32 +207,33 @@ define i1 @test_urem_oversized(i66 %X) nounwind { ; PPC: # %bb.0: ; PPC-NEXT: lis 6, -12795 ; PPC-NEXT: ori 6, 6, 40665 -; PPC-NEXT: mulhwu 8, 5, 6 +; PPC-NEXT: mulhwu 7, 5, 6 ; PPC-NEXT: lis 9, 12057 ; PPC-NEXT: ori 9, 9, 37186 ; PPC-NEXT: mullw 11, 4, 6 -; PPC-NEXT: addc 8, 11, 8 +; PPC-NEXT: addc 7, 11, 7 ; PPC-NEXT: lis 11, -5526 ; PPC-NEXT: ori 11, 11, 61135 -; PPC-NEXT: mulhwu 7, 4, 6 -; PPC-NEXT: addze 7, 7 +; PPC-NEXT: mulhwu 8, 4, 6 +; PPC-NEXT: addze 8, 8 ; PPC-NEXT: mulhwu 10, 5, 9 ; PPC-NEXT: mullw 4, 4, 9 ; PPC-NEXT: mullw 9, 5, 9 -; PPC-NEXT: addc 8, 9, 8 -; PPC-NEXT: adde 7, 7, 10 -; PPC-NEXT: add 4, 4, 7 -; PPC-NEXT: rotlwi 9, 8, 31 +; PPC-NEXT: addc 7, 9, 7 +; PPC-NEXT: addze 9, 10 +; PPC-NEXT: rotlwi 10, 7, 31 ; PPC-NEXT: mullw 3, 3, 6 ; PPC-NEXT: mullw 6, 5, 6 ; PPC-NEXT: slwi 5, 5, 1 ; PPC-NEXT: add 3, 5, 3 ; PPC-NEXT: rotlwi 5, 6, 31 +; PPC-NEXT: rlwimi 5, 7, 31, 0, 0 +; PPC-NEXT: add 7, 8, 9 +; PPC-NEXT: 
add 4, 4, 7 ; PPC-NEXT: add 3, 4, 3 -; PPC-NEXT: rlwimi 5, 8, 31, 0, 0 -; PPC-NEXT: rlwimi 9, 3, 31, 0, 0 +; PPC-NEXT: rlwimi 10, 3, 31, 0, 0 ; PPC-NEXT: cmplw 5, 11 -; PPC-NEXT: cmplwi 1, 9, 13 +; PPC-NEXT: cmplwi 1, 10, 13 ; PPC-NEXT: rlwinm 3, 3, 31, 31, 31 ; PPC-NEXT: crandc 20, 4, 6 ; PPC-NEXT: crand 21, 6, 0 diff --git a/llvm/test/CodeGen/PowerPC/vector-llrint.ll b/llvm/test/CodeGen/PowerPC/vector-llrint.ll index 190cf6fe1eaad..9229fefced67e 100644 --- a/llvm/test/CodeGen/PowerPC/vector-llrint.ll +++ b/llvm/test/CodeGen/PowerPC/vector-llrint.ll @@ -17,10 +17,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; BE-NEXT: std r0, 128(r1) ; BE-NEXT: .cfi_def_cfa_offset 112 ; BE-NEXT: .cfi_offset lr, 16 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -36,10 +36,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -55,10 +55,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; FAST-NEXT: std r0, 48(r1) ; FAST-NEXT: .cfi_def_cfa_offset 32 ; FAST-NEXT: .cfi_offset lr, 16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: mffprd r3, f0 @@ -85,18 +85,18 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; BE-NEXT: fmr f31, f1 ; BE-NEXT: fmr f1, f2 ; BE-NEXT: std r30, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -129,18 +129,18 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; CHECK-NEXT: stfd f31, 88(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f2 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -172,17 +172,17 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f2 ; FAST-NEXT: std r0, 64(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl 
__truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: fctid f1, f30 @@ -226,34 +226,34 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; BE-NEXT: stfd f31, 200(r1) # 8-byte Folded Spill ; BE-NEXT: fmr f31, f4 ; BE-NEXT: fmr f30, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -313,34 +313,34 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; CHECK-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f4 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -394,31 +394,31 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; FAST-NEXT: std r0, 80(r1) ; FAST-NEXT: fmr f31, f3 ; FAST-NEXT: fmr f30, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f30 ; FAST-NEXT: fctid f2, f31 @@ -491,66 +491,66 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; BE-NEXT: fmr f28, f5 ; BE-NEXT: fmr f27, f4 ; BE-NEXT: fmr f26, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -664,66 +664,66 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; CHECK-NEXT: stfd f31, 232(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f8 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl 
__extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -821,59 +821,59 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; FAST-NEXT: fmr f27, f4 ; FAST-NEXT: fmr f26, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f26 @@ -1001,130 +1001,130 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; BE-NEXT: fmr f23, f5 ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl 
__gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 652(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 668(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 660(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: fmr f24, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f23, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f22, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f21, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f20, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f19, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl 
__gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -1343,130 +1343,130 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r3 # 16-byte Folded Spill ; CHECK-NEXT: li r3, 160 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 568(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 576(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 584(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r16, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r17, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; 
CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -1650,115 +1650,115 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; FAST-NEXT: fmr f22, f4 ; FAST-NEXT: fmr f23, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 304(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 296(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: fmr f1, f21 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f19 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: fmr f1, f18 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: fmr f1, f17 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: fmr f1, f20 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: fmr f1, f22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: fmr f1, f23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f23 @@ -1935,272 +1935,272 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: std r3, 304(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: std r3, 296(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: std r3, 280(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: std r3, 264(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: std r3, 248(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: std r3, 232(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: std r3, 216(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: std r3, 200(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: std r3, 184(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: std r3, 168(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: std r3, 152(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1028(r1) ; BE-NEXT: std r3, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl 
__truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1044(r1) ; BE-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1036(r1) ; BE-NEXT: mr r15, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1060(r1) ; BE-NEXT: mr r14, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1052(r1) ; BE-NEXT: mr r31, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1076(r1) ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1068(r1) ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1092(r1) ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1084(r1) ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1108(r1) ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1100(r1) ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1124(r1) ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1116(r1) ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1140(r1) ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1132(r1) ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1156(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1148(r1) ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1172(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1164(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: stfs f1, 316(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: stfs f1, 312(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: stfs f1, 292(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: stfs f1, 276(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: stfs f1, 260(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: stfs f1, 244(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: stfs f1, 228(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; 
BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: stfs f1, 212(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: stfs f1, 196(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: stfs f1, 180(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: stfs f1, 164(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: stfs f1, 148(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: stfs f1, 132(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r31, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r14, 48 ; BE-NEXT: fmr f16, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r15, 48 ; BE-NEXT: fmr f15, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f14, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f31, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 136(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f30, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 152(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f29, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 168(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f28, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 184(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f27, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 200(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f26, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 216(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f25, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 232(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f24, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 248(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f23, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 264(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f22, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 280(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f21, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl 
__extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 296(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f20, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 304(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f19, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -2561,274 +2561,274 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r4 # 16-byte Folded Spill ; CHECK-NEXT: li r4, 384 ; CHECK-NEXT: stvx v31, r1, r4 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: std r3, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: std r3, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: std r3, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: std r3, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: std r3, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: std r3, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: std r3, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: std r3, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: std r3, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: std r3, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 832(r1) ; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 840(r1) ; CHECK-NEXT: std r3, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 848(r1) ; CHECK-NEXT: mr r15, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 856(r1) ; CHECK-NEXT: mr r14, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 864(r1) ; CHECK-NEXT: mr r31, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 872(r1) ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 880(r1) ; CHECK-NEXT: mr 
r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 888(r1) ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 896(r1) ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 904(r1) ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 912(r1) ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 920(r1) ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 928(r1) ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 936(r1) ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 944(r1) ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 952(r1) ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 960(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 968(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 976(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 204 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r16, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 200 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r17, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; 
CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r31, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r14, 48 ; CHECK-NEXT: fmr f16, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r15, 48 ; CHECK-NEXT: fmr f15, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 48(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f14, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 56(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f30, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 64(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v30, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 72(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v29, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 80(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v28, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 88(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v27, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 96(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v26, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 104(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v25, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v24, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v23, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 128(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v22, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 144(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v21, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 160(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v20, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 176(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f31, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl 
__extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -3200,238 +3200,238 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; FAST-NEXT: xxlor v31, f6, f6 ; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill ; FAST-NEXT: lfs f1, 768(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 120 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 760(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 112 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 752(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 104 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 744(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 96 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 736(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 88 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 728(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 80 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 720(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 72 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 712(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 64 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 704(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 56 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 696(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 48 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 688(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v21, f1, f1 ; FAST-NEXT: lfs f1, 680(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop 
; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v20, f1, f1 ; FAST-NEXT: lfs f1, 672(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v24, f1, f1 ; FAST-NEXT: lfs f1, 664(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 656(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 648(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: lfs f1, 640(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: lfs f1, 632(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: lfs f1, 624(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: xxlor f1, v25, v25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: xxlor f1, v26, v26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: xxlor f1, v27, v27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: xxlor f1, v28, v28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: xxlor f1, v29, v29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: xxlor f1, v30, v30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; 
FAST-NEXT: fmr f18, f1 ; FAST-NEXT: xxlor f1, v31, v31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f14 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f14, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: xxlor f1, v22, v22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: xxlor f1, v23, v23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 44 ; FAST-NEXT: fmr f15, f1 ; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f3, f15 ; FAST-NEXT: fctid f4, f17 diff --git a/llvm/test/CodeGen/PowerPC/vector-lrint.ll b/llvm/test/CodeGen/PowerPC/vector-lrint.ll index b6d0bd5c05894..c2576d4631db8 100644 --- a/llvm/test/CodeGen/PowerPC/vector-lrint.ll +++ b/llvm/test/CodeGen/PowerPC/vector-lrint.ll @@ -28,10 +28,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; BE-NEXT: std r0, 128(r1) ; BE-NEXT: .cfi_def_cfa_offset 112 ; BE-NEXT: .cfi_offset lr, 16 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -47,10 +47,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -66,10 +66,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; FAST-NEXT: std r0, 48(r1) ; FAST-NEXT: .cfi_def_cfa_offset 32 ; FAST-NEXT: .cfi_offset lr, 16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: mffprd r3, f0 @@ -96,18 +96,18 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; BE-NEXT: fmr f31, f1 ; BE-NEXT: fmr f1, f2 ; BE-NEXT: std r30, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; 
BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -140,18 +140,18 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; CHECK-NEXT: stfd f31, 88(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f2 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -183,17 +183,17 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f2 ; FAST-NEXT: std r0, 64(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: fctid f1, f30 @@ -237,34 +237,34 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; BE-NEXT: stfd f31, 200(r1) # 8-byte Folded Spill ; BE-NEXT: fmr f31, f4 ; BE-NEXT: fmr f30, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -324,34 +324,34 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; CHECK-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f4 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: 
bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -405,31 +405,31 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; FAST-NEXT: std r0, 80(r1) ; FAST-NEXT: fmr f31, f3 ; FAST-NEXT: fmr f30, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f30 ; FAST-NEXT: fctid f2, f31 @@ -502,66 +502,66 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; BE-NEXT: fmr f28, f5 ; BE-NEXT: fmr f27, f4 ; BE-NEXT: fmr f26, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -675,66 +675,66 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; CHECK-NEXT: stfd f31, 232(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f8 ; 
CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -832,59 +832,59 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; FAST-NEXT: fmr f27, f4 ; FAST-NEXT: fmr f26, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr 
f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f26 @@ -1012,130 +1012,130 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; BE-NEXT: fmr f23, f5 ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 652(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 668(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 660(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl 
__gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: fmr f24, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f23, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f22, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f21, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f20, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f19, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -1354,130 +1354,130 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r3 # 16-byte Folded Spill ; CHECK-NEXT: li r3, 160 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 568(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 576(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 584(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; 
CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r16, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r17, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -1661,115 +1661,115 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; FAST-NEXT: fmr f22, f4 ; FAST-NEXT: fmr f23, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 304(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 296(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee 
+; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: fmr f1, f21 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f19 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: fmr f1, f18 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: fmr f1, f17 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: fmr f1, f20 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: fmr f1, f22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: fmr f1, f23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f23 @@ -1946,272 +1946,272 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: std r3, 304(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: std r3, 296(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: std r3, 280(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: std r3, 264(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; 
BE-NEXT: fmr f1, f23 ; BE-NEXT: std r3, 248(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: std r3, 232(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: std r3, 216(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: std r3, 200(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: std r3, 184(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: std r3, 168(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: std r3, 152(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1028(r1) ; BE-NEXT: std r3, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1044(r1) ; BE-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1036(r1) ; BE-NEXT: mr r15, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1060(r1) ; BE-NEXT: mr r14, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1052(r1) ; BE-NEXT: mr r31, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1076(r1) ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1068(r1) ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1092(r1) ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1084(r1) ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1108(r1) ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1100(r1) ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1124(r1) ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1116(r1) ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1140(r1) ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1132(r1) ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1156(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1148(r1) ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1172(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1164(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; 
BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: stfs f1, 316(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: stfs f1, 312(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: stfs f1, 292(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: stfs f1, 276(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: stfs f1, 260(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: stfs f1, 244(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: stfs f1, 228(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: stfs f1, 212(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: stfs f1, 196(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: stfs f1, 180(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: stfs f1, 164(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: stfs f1, 148(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: stfs f1, 132(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r31, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r14, 48 ; BE-NEXT: fmr f16, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r15, 48 ; BE-NEXT: fmr f15, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f14, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f31, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 136(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f30, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 152(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f29, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 168(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f28, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 184(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f27, 
f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 200(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f26, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 216(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f25, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 232(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f24, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 248(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f23, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 264(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f22, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 280(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f21, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 296(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f20, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 304(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f19, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -2572,274 +2572,274 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r4 # 16-byte Folded Spill ; CHECK-NEXT: li r4, 384 ; CHECK-NEXT: stvx v31, r1, r4 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: std r3, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: std r3, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: std r3, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: std r3, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: std r3, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: std r3, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: std r3, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: std r3, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: std r3, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; 
CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: std r3, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 832(r1) ; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 840(r1) ; CHECK-NEXT: std r3, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 848(r1) ; CHECK-NEXT: mr r15, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 856(r1) ; CHECK-NEXT: mr r14, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 864(r1) ; CHECK-NEXT: mr r31, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 872(r1) ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 880(r1) ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 888(r1) ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 896(r1) ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 904(r1) ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 912(r1) ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 920(r1) ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 928(r1) ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 936(r1) ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 944(r1) ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 952(r1) ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 960(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 968(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 976(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 204 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r16, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 200 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r17, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; 
CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r31, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r14, 48 ; CHECK-NEXT: fmr f16, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r15, 48 ; CHECK-NEXT: fmr f15, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 48(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f14, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 56(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f30, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 64(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v30, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 72(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v29, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 80(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v28, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 88(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v27, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 96(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v26, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 104(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v25, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 112(r1) 
# 8-byte Folded Reload ; CHECK-NEXT: xxlor v24, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v23, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 128(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v22, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 144(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v21, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 160(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v20, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 176(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f31, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -3211,238 +3211,238 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; FAST-NEXT: xxlor v31, f6, f6 ; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill ; FAST-NEXT: lfs f1, 768(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 120 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 760(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 112 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 752(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 104 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 744(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 96 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 736(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 88 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 728(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 80 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 720(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 72 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 712(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 64 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 704(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 56 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 696(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 48 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 688(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v21, f1, f1 ; FAST-NEXT: lfs f1, 680(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v20, f1, f1 ; FAST-NEXT: lfs f1, 672(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v24, f1, f1 ; FAST-NEXT: lfs f1, 664(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 656(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 648(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: lfs f1, 640(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: lfs f1, 632(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: lfs f1, 624(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: xxlor f1, v25, v25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: xxlor f1, v26, v26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: xxlor f1, v27, v27 -; FAST-NEXT: bl __gnu_f2h_ieee +; 
FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: xxlor f1, v28, v28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: xxlor f1, v29, v29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: xxlor f1, v30, v30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: xxlor f1, v31, v31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f14 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f14, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: xxlor f1, v22, v22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: xxlor f1, v23, v23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 44 ; FAST-NEXT: fmr f15, f1 ; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f3, f15 ; FAST-NEXT: fctid f4, f17 diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 2646dfeca4eb6..194223eee69eb 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -98,6 +98,7 @@ ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: RISC-V Vector Peephole Optimization ; CHECK-NEXT: RISC-V VMV0 Elimination +; CHECK-NEXT: RISC-V Fold Memory Offset ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs diff --git a/llvm/test/CodeGen/RISCV/fold-mem-offset.ll b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll new file mode 100644 index 0000000000000..54eb3c9627691 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll @@ -0,0 +1,733 @@ +; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 | FileCheck %s --check-prefixes=CHECK,RV32I +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 | FileCheck %s --check-prefixes=CHECK,RV64I +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV32ZBA +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV64ZBA + +define i64 @test_sh3add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh3add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a1, a1, a0 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a2, 480(a1) +; RV32I-NEXT: lw a1, 484(a1) +; RV32I-NEXT: lw a3, 404(a0) +; RV32I-NEXT: lw a4, 400(a0) +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a0, a4, a2 +; RV32I-NEXT: sltu a2, a0, a4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh3add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: add a1, a1, a0 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a1, 480(a1) +; RV64I-NEXT: ld a0, 400(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh3add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh3add a1, a1, a0 +; RV32ZBA-NEXT: sh3add a0, a2, a0 +; RV32ZBA-NEXT: lw a2, 480(a1) +; RV32ZBA-NEXT: lw a1, 484(a1) +; RV32ZBA-NEXT: lw a3, 404(a0) +; RV32ZBA-NEXT: lw a4, 400(a0) +; RV32ZBA-NEXT: add a1, a3, a1 +; RV32ZBA-NEXT: add a0, a4, a2 +; RV32ZBA-NEXT: sltu a2, a0, a4 +; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh3add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add a1, a1, a0 +; RV64ZBA-NEXT: sh3add a0, a2, a0 +; RV64ZBA-NEXT: ld a1, 480(a1) +; RV64ZBA-NEXT: ld a0, 400(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %b = getelementptr inbounds nuw i8, ptr %p, i64 400 + %add = add iXLen %x, 10 + %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %add + %0 = load i64, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %y + %1 = load i64, ptr %arrayidx2, align 8 + %add3 = add nsw i64 %1, %0 + ret i64 %add3 +} + +define signext i32 @test_sh2add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh2add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, 1200(a1) +; RV32I-NEXT: lw a0, 1240(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh2add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 1200(a1) +; RV64I-NEXT: lw a0, 1240(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh2add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 1200(a1) +; RV32ZBA-NEXT: lw a0, 1240(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh2add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 1200(a1) +; RV64ZBA-NEXT: lw a0, 1240(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %arrayidx = getelementptr inbounds nuw 
[100 x i32], ptr %c, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +define signext i16 @test_sh1add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh1add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lh a1, 1600(a1) +; RV32I-NEXT: lh a0, 1620(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh1add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: slli a2, a2, 1 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lh a1, 1600(a1) +; RV64I-NEXT: lh a0, 1620(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh1add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh1add a1, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a2, a0 +; RV32ZBA-NEXT: lh a1, 1600(a1) +; RV32ZBA-NEXT: lh a0, 1620(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: slli a0, a0, 16 +; RV32ZBA-NEXT: srai a0, a0, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh1add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh1add a1, a1, a0 +; RV64ZBA-NEXT: sh1add a0, a2, a0 +; RV64ZBA-NEXT: lh a1, 1600(a1) +; RV64ZBA-NEXT: lh a0, 1620(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: slli a0, a0, 48 +; RV64ZBA-NEXT: srai a0, a0, 48 +; RV64ZBA-NEXT: ret +entry: + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %x + %0 = load i16, ptr %arrayidx, align 2 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %add + %1 = load i16, ptr %arrayidx2, align 2 + %add4 = add i16 %1, %0 + ret i16 %add4 +} + +define zeroext i8 @test_add(ptr %p, iXLen %x, iXLen %y) { +; CHECK-LABEL: test_add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: lbu a1, 1800(a1) +; CHECK-NEXT: lbu a0, 1810(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_add: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 1800(a1) +; ZBA-NEXT: lbu a0, 1810(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} + +define i64 @test_sh3add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh3add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a2, 404(a0) +; RV32I-NEXT: lw a3, 400(a1) +; RV32I-NEXT: lw a1, 404(a1) +; RV32I-NEXT: lw a4, 400(a0) +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: add a0, a4, a3 +; RV32I-NEXT: sltu a2, a0, a4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh3add_uw: +; RV64I: # %bb.0: # 
%entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 29 +; RV64I-NEXT: srli a2, a2, 29 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a1, 400(a1) +; RV64I-NEXT: ld a0, 400(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh3add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh3add a1, a1, a0 +; RV32ZBA-NEXT: sh3add a0, a2, a0 +; RV32ZBA-NEXT: lw a2, 404(a0) +; RV32ZBA-NEXT: lw a3, 400(a1) +; RV32ZBA-NEXT: lw a1, 404(a1) +; RV32ZBA-NEXT: lw a4, 400(a0) +; RV32ZBA-NEXT: add a1, a2, a1 +; RV32ZBA-NEXT: add a0, a4, a3 +; RV32ZBA-NEXT: sltu a2, a0, a4 +; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh3add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add.uw a1, a1, a0 +; RV64ZBA-NEXT: sh3add.uw a0, a2, a0 +; RV64ZBA-NEXT: ld a1, 400(a1) +; RV64ZBA-NEXT: ld a0, 400(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %b = getelementptr inbounds nuw i8, ptr %p, i64 400 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom + %0 = load i64, ptr %arrayidx, align 8 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom2 + %1 = load i64, ptr %arrayidx3, align 8 + %add4 = add nsw i64 %1, %0 + ret i64 %add4 +} + +define signext i32 @test_sh2add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh2add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a1, 1200(a1) +; RV32I-NEXT: lw a0, 1200(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh2add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 30 +; RV64I-NEXT: srli a2, a2, 30 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lw a1, 1200(a1) +; RV64I-NEXT: lw a0, 1200(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh2add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 1200(a1) +; RV32ZBA-NEXT: lw a0, 1200(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh2add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add.uw a1, a1, a0 +; RV64ZBA-NEXT: sh2add.uw a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 1200(a1) +; RV64ZBA-NEXT: lw a0, 1200(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom2 + %1 = load i32, ptr %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + ret i32 %add4 +} + +define signext i16 @test_sh1add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh1add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lh a1, 1600(a1) +; RV32I-NEXT: lh a0, 1620(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh1add_uw: +; RV64I: # 
%bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: addi a2, a2, 10 +; RV64I-NEXT: srli a1, a1, 31 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: srli a2, a2, 31 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lh a1, 1600(a1) +; RV64I-NEXT: lh a0, 1600(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh1add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh1add a1, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a2, a0 +; RV32ZBA-NEXT: lh a1, 1600(a1) +; RV32ZBA-NEXT: lh a0, 1620(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: slli a0, a0, 16 +; RV32ZBA-NEXT: srai a0, a0, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh1add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh1add.uw a1, a1, a0 +; RV64ZBA-NEXT: addi a2, a2, 10 +; RV64ZBA-NEXT: sh1add.uw a0, a2, a0 +; RV64ZBA-NEXT: lh a1, 1600(a1) +; RV64ZBA-NEXT: lh a0, 1600(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: slli a0, a0, 48 +; RV64ZBA-NEXT: srai a0, a0, 48 +; RV64ZBA-NEXT: ret +entry: + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom + %0 = load i16, ptr %arrayidx, align 2 + %add = add i32 %y, 10 + %idxprom2 = zext i32 %add to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom2 + %1 = load i16, ptr %arrayidx3, align 2 + %add5 = add i16 %1, %0 + ret i16 %add5 +} + +define zeroext i8 @test_add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lbu a1, 1800(a1) +; RV32I-NEXT: lbu a0, 1800(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: srli a2, a2, 32 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lbu a1, 1800(a1) +; RV64I-NEXT: lbu a0, 1800(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: lbu a1, 1800(a1) +; RV32ZBA-NEXT: lbu a0, 1800(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: andi a0, a0, 255 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add.uw a1, a1, a0 +; RV64ZBA-NEXT: add.uw a0, a2, a0 +; RV64ZBA-NEXT: lbu a1, 1800(a1) +; RV64ZBA-NEXT: lbu a0, 1800(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: andi a0, a0, 255 +; RV64ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom + %0 = load i8, ptr %arrayidx, align 1 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom2 + %1 = load i8, ptr %arrayidx3, align 1 + %add5 = add i8 %1, %0 + ret i8 %add5 +} + +; The addi is part of the index and used with 2 different scales. 
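+; A sketch of the fold in the test below (illustrative arithmetic; the
+; CHECK lines are the authoritative output): with %sub = add iXLen %x, -1
+; feeding both arrays,
+;   i32 load at 1200 + 4*(%x - 1)  becomes  offset 1196 indexed by %x
+;   i16 load at 1600 + 2*(%x - 1)  becomes  offset 1598 indexed by %x
+; i.e. the addi constant is multiplied by each use's scale (4 and 2) and
+; folded into that use's load immediate.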
+define signext i32 @test_scaled_index_addi(ptr %p, iXLen %x) { +; RV32I-LABEL: test_scaled_index_addi: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a2, a1, 2 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 1196(a2) +; RV32I-NEXT: lh a0, 1598(a0) +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_scaled_index_addi: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: add a2, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a1, 1196(a2) +; RV64I-NEXT: lh a0, 1598(a0) +; RV64I-NEXT: addw a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_scaled_index_addi: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a2, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a1, a0 +; RV32ZBA-NEXT: lw a1, 1196(a2) +; RV32ZBA-NEXT: lh a0, 1598(a0) +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_scaled_index_addi: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a2, a1, a0 +; RV64ZBA-NEXT: sh1add a0, a1, a0 +; RV64ZBA-NEXT: lw a1, 1196(a2) +; RV64ZBA-NEXT: lh a0, 1598(a0) +; RV64ZBA-NEXT: addw a0, a1, a0 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %sub = add iXLen %x, -1 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %sub + %0 = load i32, ptr %arrayidx, align 4 + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %sub + %1 = load i16, ptr %arrayidx2, align 2 + %conv = sext i16 %1 to i32 + %add = add nsw i32 %0, %conv + ret i32 %add +} + +; Offset is a pair of addis. We can fold one of them. +define signext i32 @test_medium_offset(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_medium_offset: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, 753(a1) +; RV32I-NEXT: lw a0, 793(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_medium_offset: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a0, a0, 2047 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 753(a1) +; RV64I-NEXT: lw a0, 793(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_medium_offset: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a0, a0, 2047 +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 753(a1) +; RV32ZBA-NEXT: lw a0, 793(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_medium_offset: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a0, a0, 2047 +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 753(a1) +; RV64ZBA-NEXT: lw a0, 793(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 + %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +; Offset is a lui+addiw. We can't fold this on RV64. 
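+; A hedged explanation, inferred from the output rather than quoted from
+; the pass: on RV64 the 6800-byte base offset is materialized as
+;   lui   a3, 2          # a3 = 8192
+;   addiw a3, a3, -1392  # a3 = sext32(8192 - 1392) = 6800
+; and addiw sign-extends bit 31 of its 32-bit sum, so peeling -1392 off
+; into a load immediate is not equivalent in general. RV32 builds the
+; constant with a plain lui+addi, with no such truncation, so the fold
+; does fire there and the loads below address -1392/-1352 directly.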
+define signext i32 @test_large_offset(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_large_offset: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a3, 2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, -1392(a1) +; RV32I-NEXT: lw a0, -1352(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_large_offset: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: lui a3, 2 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: addiw a3, a3, -1392 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 0(a1) +; RV64I-NEXT: lw a0, 40(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_large_offset: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: li a3, 1700 +; RV32ZBA-NEXT: sh2add a0, a3, a0 +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 0(a1) +; RV32ZBA-NEXT: lw a0, 40(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_large_offset: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: li a3, 1700 +; RV64ZBA-NEXT: sh2add a0, a3, a0 +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 0(a1) +; RV64ZBA-NEXT: lw a0, 40(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %g = getelementptr inbounds nuw i8, ptr %p, i64 6800 + %arrayidx = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +; After folding we can CSE the sh2add +define signext i32 @test_cse(ptr %p, iXLen %x) { +; RV32I-LABEL: test_cse: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 1200(a0) +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: lw a0, 753(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_cse: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a1, 1200(a0) +; RV64I-NEXT: addi a0, a0, 2047 +; RV64I-NEXT: lw a0, 753(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_cse: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a0, a1, a0 +; RV32ZBA-NEXT: lw a1, 1200(a0) +; RV32ZBA-NEXT: addi a0, a0, 2047 +; RV32ZBA-NEXT: lw a0, 753(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_cse: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a0, a1, a0 +; RV64ZBA-NEXT: lw a1, 1200(a0) +; RV64ZBA-NEXT: addi a0, a0, 2047 +; RV64ZBA-NEXT: lw a0, 753(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 + %arrayidx1 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x + %1 = load i32, ptr %arrayidx1, align 4 + %add = add nsw i32 %1, %0 + ret i32 %add +} + +define zeroext i8 @test_optsize(ptr %p, iXLen %x, iXLen %y) optsize { +; CHECK-LABEL: test_optsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a0, a0, 1800 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; 
CHECK-NEXT: lbu a1, 0(a1) +; CHECK-NEXT: lbu a0, 10(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_optsize: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: addi a0, a0, 1800 +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 0(a1) +; ZBA-NEXT: lbu a0, 10(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} + +define zeroext i8 @test_minsize(ptr %p, iXLen %x, iXLen %y) minsize { +; CHECK-LABEL: test_minsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a0, a0, 1800 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: lbu a1, 0(a1) +; CHECK-NEXT: lbu a0, 10(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_minsize: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: addi a0, a0, 1800 +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 0(a1) +; ZBA-NEXT: lbu a0, 10(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 585a331e55094..bef29dfecef4c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -1318,11 +1318,10 @@ define void @sqrt_v6bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsqrt.v v8, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -1371,11 +1370,10 @@ define void @sqrt_v6f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v8, v10 -; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll new file mode 100644 index 0000000000000..7c569da9291db --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32 
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64 + +define i32 @test(<8 x i1> %mask) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret + %1 = bitcast <8 x i1> %mask to i8 + %2 = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %1) + %3 = zext nneg i8 %2 to i32 + ret i32 %3 +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll new file mode 100644 index 0000000000000..16c4ade7fa9cb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64 + +define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef signext %c, i32 noundef signext %n) { +; RV32-LABEL: test_store1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: blez a3, .LBB0_6 +; RV32-NEXT: # %bb.1: # %for.body.preheader +; RV32-NEXT: li a4, 8 +; RV32-NEXT: bltu a3, a4, .LBB0_7 +; RV32-NEXT: # %bb.2: # %for.body.preheader +; RV32-NEXT: sub a4, a0, a1 +; RV32-NEXT: sltu a5, a0, a1 +; RV32-NEXT: neg a5, a5 +; RV32-NEXT: sltiu a4, a4, 32 +; RV32-NEXT: seqz a5, a5 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: bnez a4, .LBB0_7 +; RV32-NEXT: # %bb.3: # %vector.ph +; RV32-NEXT: lui a5, 524288 +; RV32-NEXT: addi a5, a5, -8 +; RV32-NEXT: and a5, a3, a5 +; RV32-NEXT: li a7, 0 +; RV32-NEXT: li a6, 0 +; RV32-NEXT: .LBB0_4: # %vector.body +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: slli t0, a7, 2 +; RV32-NEXT: addi t1, a7, 8 +; RV32-NEXT: add t0, a1, t0 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (t0) +; RV32-NEXT: sltu a7, t1, a7 +; RV32-NEXT: xor t0, t1, a5 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: vmslt.vx v10, v8, a2 +; RV32-NEXT: vcompress.vm v12, v8, v10 +; RV32-NEXT: vcpop.m a7, v10 +; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV32-NEXT: vse32.v v12, (a0) +; RV32-NEXT: slli a7, a7, 2 +; RV32-NEXT: or t0, t0, a6 +; RV32-NEXT: add a0, a0, a7 +; RV32-NEXT: mv a7, t1 +; RV32-NEXT: bnez t0, .LBB0_4 +; RV32-NEXT: # %bb.5: # %middle.block +; RV32-NEXT: bne a5, a3, .LBB0_9 +; RV32-NEXT: .LBB0_6: # %for.cond.cleanup +; RV32-NEXT: ret +; RV32-NEXT: .LBB0_7: +; RV32-NEXT: li a5, 0 +; RV32-NEXT: li a4, 0 +; RV32-NEXT: j .LBB0_9 +; RV32-NEXT: .LBB0_8: # %for.inc +; RV32-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV32-NEXT: addi a5, a5, 1 +; RV32-NEXT: seqz a6, a5 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: xor a6, a5, a3 +; RV32-NEXT: or a6, a6, a4 +; RV32-NEXT: beqz a6, .LBB0_6 +; RV32-NEXT: .LBB0_9: # %for.body +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: slli a6, a5, 2 +; RV32-NEXT: add a6, a1, a6 +; RV32-NEXT: lw a6, 0(a6) +; RV32-NEXT: bge a6, a2, .LBB0_8 +; RV32-NEXT: # %bb.10: # %if.then +; RV32-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV32-NEXT: addi a7, a0, 4 +; RV32-NEXT: sw a6, 0(a0) +; RV32-NEXT: mv a0, a7 +; RV32-NEXT: j .LBB0_8 +; +; RV64-LABEL: test_store1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: blez a3, .LBB0_6 +; RV64-NEXT: # %bb.1: # %for.body.preheader +; RV64-NEXT: li a5, 8 +; RV64-NEXT: li a4, 0 +; 
RV64-NEXT: bltu a3, a5, .LBB0_7 +; RV64-NEXT: # %bb.2: # %for.body.preheader +; RV64-NEXT: sub a5, a0, a1 +; RV64-NEXT: li a6, 31 +; RV64-NEXT: bgeu a6, a5, .LBB0_7 +; RV64-NEXT: # %bb.3: # %vector.ph +; RV64-NEXT: lui a4, 524288 +; RV64-NEXT: addiw a4, a4, -8 +; RV64-NEXT: and a4, a3, a4 +; RV64-NEXT: slli a5, a4, 2 +; RV64-NEXT: add a5, a5, a1 +; RV64-NEXT: mv a6, a1 +; RV64-NEXT: .LBB0_4: # %vector.body +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a6) +; RV64-NEXT: addi a6, a6, 32 +; RV64-NEXT: vmslt.vx v10, v8, a2 +; RV64-NEXT: vcompress.vm v12, v8, v10 +; RV64-NEXT: vcpop.m a7, v10 +; RV64-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV64-NEXT: vse32.v v12, (a0) +; RV64-NEXT: slli a7, a7, 2 +; RV64-NEXT: add a0, a0, a7 +; RV64-NEXT: bne a6, a5, .LBB0_4 +; RV64-NEXT: # %bb.5: # %middle.block +; RV64-NEXT: bne a4, a3, .LBB0_7 +; RV64-NEXT: .LBB0_6: # %for.cond.cleanup +; RV64-NEXT: ret +; RV64-NEXT: .LBB0_7: # %for.body.preheader13 +; RV64-NEXT: slli a4, a4, 2 +; RV64-NEXT: slli a5, a3, 2 +; RV64-NEXT: add a3, a1, a4 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: j .LBB0_9 +; RV64-NEXT: .LBB0_8: # %for.inc +; RV64-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV64-NEXT: addi a3, a3, 4 +; RV64-NEXT: beq a3, a1, .LBB0_6 +; RV64-NEXT: .LBB0_9: # %for.body +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: lw a4, 0(a3) +; RV64-NEXT: bge a4, a2, .LBB0_8 +; RV64-NEXT: # %bb.10: # %if.then +; RV64-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV64-NEXT: addi a5, a0, 4 +; RV64-NEXT: sw a4, 0(a0) +; RV64-NEXT: mv a0, a5 +; RV64-NEXT: j .LBB0_8 +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %dst11 = ptrtoint ptr %dst to i64 + %src12 = ptrtoint ptr %src to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %min.iters.check = icmp ult i32 %n, 8 + %0 = sub i64 %dst11, %src12 + %diff.check = icmp ult i64 %0, 32 + %or.cond = or i1 %min.iters.check, %diff.check + br i1 %or.cond, label %for.body.preheader13, label %vector.ph + +for.body.preheader13: ; preds = %middle.block, %for.body.preheader + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %dst.addr.09.ph = phi ptr [ %dst, %for.body.preheader ], [ %monotonic.add, %middle.block ] + br label %for.body + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 2147483640 + %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %c, i64 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %monotonic.iv = phi ptr [ %dst, %vector.ph ], [ %monotonic.add, %vector.body ] + %1 = getelementptr inbounds i32, ptr %src, i64 %index + %wide.load = load <8 x i32>, ptr %1, align 4 + %2 = icmp slt <8 x i32> %wide.load, %broadcast.splat + tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %wide.load, ptr align 4 %monotonic.iv, <8 x i1> %2) + %3 = bitcast <8 x i1> %2 to i8 + %4 = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %3) + %5 = shl nuw nsw i8 %4, 2 + %6 = zext nneg i8 %5 to i64 + %monotonic.add = getelementptr inbounds i8, ptr %monotonic.iv, i64 %6 + %index.next = add nuw i64 %index, 8 + %7 = icmp eq i64 %index.next, %n.vec + br i1 %7, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n 
= icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13 + +for.cond.cleanup: ; preds = %for.inc, %middle.block, %entry + ret void + +for.body: ; preds = %for.body.preheader13, %for.inc + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ %indvars.iv.ph, %for.body.preheader13 ] + %dst.addr.09 = phi ptr [ %dst.addr.1, %for.inc ], [ %dst.addr.09.ph, %for.body.preheader13 ] + %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv + %8 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %8, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %incdec.ptr = getelementptr inbounds i8, ptr %dst.addr.09, i64 4 + store i32 %8, ptr %dst.addr.09, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %dst.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %dst.addr.09, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index c6ee9e34dc207..5cd9b77af82cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -5069,3 +5069,51 @@ define @vfwmaccbf16_vf( %a, bfloat %b, %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) ret %2 } + +define @vfsqrt( %a) { +; NOVLOPT-LABEL: vfsqrt: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: fsrmi a0, 0 +; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; NOVLOPT-NEXT: vfsqrt.v v10, v8 +; NOVLOPT-NEXT: fsrm a0 +; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; NOVLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; NOVLOPT-NEXT: vmv4r.v v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfsqrt: +; VLOPT: # %bb.0: +; VLOPT-NEXT: fsrmi a0, 0 +; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; VLOPT-NEXT: vfsqrt.v v10, v8 +; VLOPT-NEXT: fsrm a0 +; VLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; VLOPT-NEXT: vmv4r.v v8, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfsqrt.nxv4f32( poison, %a, iXLen 0, iXLen 7) + %2 = call @llvm.riscv.vfwmacc( poison, %a, %1, iXLen 7, iXLen 6, iXLen 0) + ret %2 +} + +define @vfrsqrt7( %a) { +; NOVLOPT-LABEL: vfrsqrt7: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; NOVLOPT-NEXT: vfrsqrt7.v v10, v8 +; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; NOVLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; NOVLOPT-NEXT: vmv4r.v v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfrsqrt7: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; VLOPT-NEXT: vfrsqrt7.v v10, v8 +; VLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; VLOPT-NEXT: vmv4r.v v8, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfrsqrt7.nxv4f32( poison, %a, iXLen 7) + %2 = call @llvm.riscv.vfwmacc( poison, %a, %1, iXLen 7, iXLen 6, iXLen 0) + ret %2 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 0475a988e9851..cb43a89ea3bc6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -141,6 +141,46 @@ body: | %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 ... 
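+# The four tests below check that the VL optimizer shrinks the VL of an FP
+# instruction only when it carries the nofpexcept flag. A reading of the
+# expected output (our inference, not a comment taken from the pass):
+#   nofpexcept PseudoVFSQRT_V_M2_E32 ..., VL=8  ->  VL=6 (the consumer's VL)
+#   PseudoVFSQRT_V_M2_E32 ..., VL=8             ->  unchanged
+# Without the flag, tail elements dropped by a smaller VL could still have
+# raised FP exception flags, so reducing VL would be observable.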
--- +name: vfsqrt_nofpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfsqrt_nofpexcept + ; CHECK: %x:vrm2 = nofpexcept PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 6, 5 /* e32 */, 3 /* ta, ma */, implicit $frm + ; CHECK-NEXT: early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4 /* e16 */, 3 /* ta, ma */, implicit $frm + %x:vrm2 = nofpexcept PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5, 3, implicit $frm + early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4, 3, implicit $frm +... +--- +name: vfsqrt_fpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfsqrt_fpexcept + ; CHECK: %x:vrm2 = PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5 /* e32 */, 3 /* ta, ma */, implicit $frm + ; CHECK-NEXT: early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4 /* e16 */, 3 /* ta, ma */, implicit $frm + %x:vrm2 = PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5, 3, implicit $frm + early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4, 3, implicit $frm +... +--- +name: vfrsqrt7_nofpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfrsqrt7_nofpexcept + ; CHECK: %x:vrm2 = nofpexcept PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vrm2 = nofpexcept PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5, 0 + %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- +name: vfrsqrt7_fpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfrsqrt7_fpexcept + ; CHECK: %x:vrm2 = PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vrm2 = PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5, 0 + %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... 
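+# Same pattern for the vfrsqrt7 tests above:
+#   nofpexcept PseudoVFRSQRT7_V_M2_E32 ..., VL=7  ->  VL=1 (consumer's VL)
+#   PseudoVFRSQRT7_V_M2_E32 ..., VL=7             ->  unchanged
+# (again our reading of the checks, not pass documentation)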
+--- name: vwadd_tied_vs1 body: | bb.0: diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index 8f5b044c3b3b8..cecd34956df8c 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -11,17 +11,16 @@ define void @test1(ptr %sp, ptr %t, i32 %n) { ; RV32I-LABEL: test1: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a2, 20 ; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lui a2, 20 ; RV32I-NEXT: li a3, 2 -; RV32I-NEXT: addi a2, a2, -1920 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a2, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) +; RV32I-NEXT: sw a3, -1920(a0) +; RV32I-NEXT: sw a2, -1916(a0) +; RV32I-NEXT: sw a2, -1920(a1) +; RV32I-NEXT: sw a3, -1916(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test1: @@ -58,17 +57,16 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a4, 20 -; RV32I-NEXT: addi a4, a4, -1920 ; RV32I-NEXT: add a1, a1, a4 ; RV32I-NEXT: add a0, a0, a4 ; RV32I-NEXT: blez a2, .LBB1_2 ; RV32I-NEXT: .LBB1_1: # %while_body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: addi a4, a3, 1 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a4, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) +; RV32I-NEXT: sw a4, -1920(a0) +; RV32I-NEXT: sw a3, -1916(a0) +; RV32I-NEXT: sw a4, -1920(a1) +; RV32I-NEXT: sw a3, -1916(a1) ; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: blt a4, a2, .LBB1_1 ; RV32I-NEXT: .LBB1_2: # %while_end @@ -126,11 +124,10 @@ define void @test3(ptr %t) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, 20 ; RV32I-NEXT: li a2, 2 -; RV32I-NEXT: addi a1, a1, -1920 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li a1, 3 -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a2, -1916(a0) +; RV32I-NEXT: sw a1, -1912(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test3: diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index e761fcb736a87..578f51a957a75 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -1136,10 +1136,9 @@ define i64 @lrd_large_offset(ptr %a, i64 %b) { ; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 3 ; RV32XTHEADMEMIDX-NEXT: add a0, a1, a0 ; RV32XTHEADMEMIDX-NEXT: lui a1, 23 -; RV32XTHEADMEMIDX-NEXT: addi a1, a1, 1792 ; RV32XTHEADMEMIDX-NEXT: add a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: lw a0, 0(a1) -; RV32XTHEADMEMIDX-NEXT: lw a1, 4(a1) +; RV32XTHEADMEMIDX-NEXT: lw a0, 1792(a1) +; RV32XTHEADMEMIDX-NEXT: lw a1, 1796(a1) ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lrd_large_offset: diff --git a/llvm/test/CodeGen/SPARC/fp16-promote.ll b/llvm/test/CodeGen/SPARC/fp16-promote.ll index a15104c7b8cff..efe67b04e8fb3 100644 --- a/llvm/test/CodeGen/SPARC/fp16-promote.ll +++ b/llvm/test/CodeGen/SPARC/fp16-promote.ll @@ -20,7 +20,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; V8-LABEL: test_fpextend_float: ; V8: ! %bb.0: ; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __gnu_h2f_ieee +; V8-NEXT: call __extendhfsf2 ; V8-NEXT: lduh [%i0], %o0 ; V8-NEXT: ret ; V8-NEXT: restore @@ -28,7 +28,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; V9-LABEL: test_fpextend_float: ; V9: ! 
%bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: ret ; V9-NEXT: restore @@ -36,7 +36,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; SPARC64-LABEL: test_fpextend_float: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore @@ -49,7 +49,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; V8-LABEL: test_fpextend_double: ; V8: ! %bb.0: ; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __gnu_h2f_ieee +; V8-NEXT: call __extendhfsf2 ; V8-NEXT: lduh [%i0], %o0 ; V8-NEXT: fstod %f0, %f0 ; V8-NEXT: ret @@ -58,7 +58,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; V9-LABEL: test_fpextend_double: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: fstod %f0, %f0 ; V9-NEXT: ret @@ -67,7 +67,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; SPARC64-LABEL: test_fpextend_double: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: fstod %f0, %f0 ; SPARC64-NEXT: ret @@ -81,7 +81,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V8-OPT-LABEL: test_fpextend_fp128: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -112, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-20] ; V8-OPT-NEXT: add %fp, -16, %i0 @@ -99,7 +99,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V8-UNOPT-LABEL: test_fpextend_fp128: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -112, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-20] ; V8-UNOPT-NEXT: add %fp, -16, %i0 @@ -125,7 +125,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V9-LABEL: test_fpextend_fp128: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -112, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-20] ; V9-NEXT: add %fp, -16, %i0 @@ -143,7 +143,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; SPARC64-LABEL: test_fpextend_fp128: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: add %fp, 2031, %o0 ; SPARC64-NEXT: fmovs %f0, %f3 @@ -165,7 +165,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V8-OPT-LABEL: test_fptrunc_float: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -96, %sp -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: mov %i0, %o0 ; V8-OPT-NEXT: sth %o0, [%i1] ; V8-OPT-NEXT: ret @@ -176,7 +176,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V8-UNOPT-NEXT: save %sp, -96, %sp ; V8-UNOPT-NEXT: mov %i0, %o0 ; V8-UNOPT-NEXT: st %o0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %f0 ; V8-UNOPT-NEXT: sth %o0, [%i1] ; V8-UNOPT-NEXT: ret @@ -185,7 +185,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V9-LABEL: test_fptrunc_float: ; V9: ! 
%bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: mov %i0, %o0 ; V9-NEXT: sth %o0, [%i1] ; V9-NEXT: ret @@ -194,7 +194,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; SPARC64-LABEL: test_fptrunc_float: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: nop ; SPARC64-NEXT: sth %o0, [%i1] ; SPARC64-NEXT: ret @@ -329,15 +329,15 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fadd: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-OPT-NEXT: fadds %f1, %f0, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 ; V8-OPT-NEXT: sth %o0, [%i0] ; V8-OPT-NEXT: ret @@ -346,16 +346,16 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fadd: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 ; V8-UNOPT-NEXT: fmovs %f0, %f1 ; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload ; V8-UNOPT-NEXT: fadds %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %o0 ; V8-UNOPT-NEXT: sth %o0, [%i0] ; V8-UNOPT-NEXT: ret @@ -364,15 +364,15 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fadd: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V9-NEXT: fadds %f1, %f0, %f0 ; V9-NEXT: st %f0, [%fp+-4] -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 ; V9-NEXT: sth %o0, [%i0] ; V9-NEXT: ret @@ -381,13 +381,13 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fadd: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: fadds %f1, %f0, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret @@ -403,15 +403,15 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fmul: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-8] ! 
4-byte Folded Spill -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-OPT-NEXT: fmuls %f1, %f0, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 ; V8-OPT-NEXT: sth %o0, [%i0] ; V8-OPT-NEXT: ret @@ -420,16 +420,16 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fmul: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 ; V8-UNOPT-NEXT: fmovs %f0, %f1 ; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload ; V8-UNOPT-NEXT: fmuls %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %o0 ; V8-UNOPT-NEXT: sth %o0, [%i0] ; V8-UNOPT-NEXT: ret @@ -438,15 +438,15 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fmul: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V9-NEXT: fmuls %f1, %f0, %f0 ; V9-NEXT: st %f0, [%fp+-4] -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 ; V9-NEXT: sth %o0, [%i0] ; V9-NEXT: ret @@ -455,13 +455,13 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fmul: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 
4-byte Folded Reload -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: fmuls %f1, %f0, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret diff --git a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll index 4e30778d5c158..f105966bc4d08 100644 --- a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll +++ b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll @@ -26,9 +26,9 @@ define float @func_i16fp32(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 ; CHECK-NEXT: ld %s10, 8(, %s11) @@ -58,9 +58,9 @@ define double @func_i16fp64(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -91,9 +91,9 @@ define float @func_fp16fp32(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 ; CHECK-NEXT: ld %s10, 8(, %s11) @@ -123,9 +123,9 @@ define double @func_fp16fp64(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -157,9 +157,9 @@ define void @func_fp32i16(ptr %fl.ptr, float %val) { ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: st %s18, 288(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: or %s18, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_f2h_ieee@lo +; CHECK-NEXT: lea %s0, __truncsfhf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_f2h_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __truncsfhf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: st2b %s0, (, %s18) @@ -194,15 +194,15 @@ define half @func_fp32fp16(ptr %fl.ptr, float %a) { ; CHECK-NEXT: st %s18, 288(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: st %s19, 296(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: or %s18, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_f2h_ieee@lo +; CHECK-NEXT: lea %s0, __truncsfhf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_f2h_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __truncsfhf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s19, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s0, __extendhfsf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, 
__extendhfsf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s19 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: st2b %s19, (, %s18) diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll new file mode 100644 index 0000000000000..ea4d32bae9ccb --- /dev/null +++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll @@ -0,0 +1,1162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=SSE2 %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=FAST_ISEL_SSE2 %s +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=AVX512BF16 %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=FAST_ISEL_AVX512BF16 %s +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=AVXNECONVERT %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=FAST_ISEL_AVXNECONVERT %s + +define bfloat @return_arg_bf16(bfloat %x) #0 { +; SSE2-LABEL: return_arg_bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %rax +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: popq %rax +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $0, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret bfloat %x +} + +define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v2bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v2bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: addq $40, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v2bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v2bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v2bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v2bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <2 x bfloat> %x +} + +define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v3bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v3bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: addq $40, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v3bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v3bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $2, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $1, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm1, %eax +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v3bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: 
return_arg_v3bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $2, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $1, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm1, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm1, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <3 x bfloat> %x +} + +define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v4bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v4bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; FAST_ISEL_SSE2-NEXT: addq $56, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v4bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v4bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v4bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v4bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <4 x bfloat> %x +} + +define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v8bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v8bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: 
movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FAST_ISEL_SSE2-NEXT: addq $56, %rsp +; FAST_ISEL_SSE2-NEXT: popq %rbx +; FAST_ISEL_SSE2-NEXT: popq %r14 +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v8bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v8bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v8bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v8bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <8 x bfloat> %x +} + +define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 { +; +; SSE2-LABEL: return_arg_v16bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v16bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $104, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; 
FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: addq $104, %rsp +; FAST_ISEL_SSE2-NEXT: popq %rbx +; FAST_ISEL_SSE2-NEXT: popq %r14 +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v16bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v16bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v16bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v16bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <16 x bfloat> %x +} + +declare bfloat @returns_bf16(bfloat) +declare <2 x bfloat> @returns_v2bf16(<2 x bfloat>) +declare <3 x bfloat> @returns_v3bf16(<3 x bfloat>) +declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>) +declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>) +declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>) + +define bfloat @call_ret_bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE2-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %rax +; FAST_ISEL_SSE2-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: callq returns_bf16@PLT +; +; AVX512BF16-LABEL: call_ret_bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX512BF16-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVXNECONVERT-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_bf16@PLT + %val = load bfloat, ptr %ptr + call bfloat @returns_bf16(bfloat %val) + unreachable +} + +define <2 x bfloat> @call_ret_v2bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v2bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v2bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: movl (%rdi), %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, (%rsp) +; FAST_ISEL_SSE2-NEXT: movdqa (%rsp), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; 
FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq returns_v2bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v2bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512BF16-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v2bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v2bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v2bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVXNECONVERT-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v2bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v2bf16@PLT + %val = load <2 x bfloat>, ptr %ptr + call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val) + unreachable +} + +define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v3bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movl 4(%rdi), %eax +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v3bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax +; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx +; FAST_ISEL_SSE2-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx +; FAST_ISEL_SSE2-NEXT: shll $16, %ecx +; FAST_ISEL_SSE2-NEXT: movd %ecx, %xmm0 +; FAST_ISEL_SSE2-NEXT: shrq $32, %rax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0 +; 
FAST_ISEL_SSE2-NEXT: callq returns_v3bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v3bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v3bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: movq (%rdi), %rax +; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx +; FAST_ISEL_AVX512BF16-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %ecx +; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: shrq $32, %rax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm2, %eax +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v3bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v3bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: movl 4(%rdi), %eax +; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v3bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movq (%rdi), %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm2, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm0, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v3bf16@PLT + %val = load <3 x bfloat>, ptr %ptr + call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val) + unreachable +} + +define <4 x bfloat> @call_ret_v4bf16(ptr 
%ptr) #0 { +; +; SSE2-LABEL: call_ret_v4bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v4bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; FAST_ISEL_SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; FAST_ISEL_SSE2-NEXT: callq returns_v4bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v4bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v4bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v4bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v4bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVXNECONVERT-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v4bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v4bf16@PLT + %val = load <4 x bfloat>, ptr %ptr + call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val) + unreachable +} + +define <8 x bfloat> @call_ret_v8bf16(ptr %ptr) #0 { +; +; 
SSE2-LABEL: call_ret_v8bf16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: callq returns_v8bf16@PLT
+;
+; FAST_ISEL_SSE2-LABEL: call_ret_v8bf16:
+; FAST_ISEL_SSE2: # %bb.0:
+; FAST_ISEL_SSE2-NEXT: pushq %r14
+; FAST_ISEL_SSE2-NEXT: pushq %rbx
+; FAST_ISEL_SSE2-NEXT: subq $56, %rsp
+; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1
+; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
+; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
+; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
+; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
+; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
+; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
+; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
+; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
+; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1
+; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; FAST_ISEL_SSE2-NEXT: callq returns_v8bf16@PLT
+;
+; AVX512BF16-LABEL: call_ret_v8bf16:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: pushq %rax
+; AVX512BF16-NEXT: vmovaps (%rdi), %xmm0
+; AVX512BF16-NEXT: callq returns_v8bf16@PLT
+;
+; FAST_ISEL_AVX512BF16-LABEL: call_ret_v8bf16:
+; FAST_ISEL_AVX512BF16: # %bb.0:
+; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
+; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %xmm0
+; FAST_ISEL_AVX512BF16-NEXT: callq returns_v8bf16@PLT
+;
+; AVXNECONVERT-LABEL: call_ret_v8bf16:
+; AVXNECONVERT: # %bb.0:
+; AVXNECONVERT-NEXT: pushq %rax
+; AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0
+; AVXNECONVERT-NEXT: callq returns_v8bf16@PLT
+;
+; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v8bf16:
+; FAST_ISEL_AVXNECONVERT: # %bb.0:
+; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
+; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0
+; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v8bf16@PLT
+ %val = load <8 x bfloat>, ptr %ptr
+ call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val)
+ unreachable
+}
+
+define <16 x bfloat> @call_ret_v16bf16(ptr %ptr) #0 {
+;
+; SSE2-LABEL: call_ret_v16bf16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: callq returns_v16bf16@PLT
+;
+; FAST_ISEL_SSE2-LABEL: call_ret_v16bf16:
+; FAST_ISEL_SSE2: # %bb.0:
+; FAST_ISEL_SSE2-NEXT: pushq %r14
+; FAST_ISEL_SSE2-NEXT: pushq %rbx
+; FAST_ISEL_SSE2-NEXT: subq $104, %rsp
+; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1
+; FAST_ISEL_SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
+; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax
+; FAST_ISEL_SSE2-NEXT: shll $16, %eax
+; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
+; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
+; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
+; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
+; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
+; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
+; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
+; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
+; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
+; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
+; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
+; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
+; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
+; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
+; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
+; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
+; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
+; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
+; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
+; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
+; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
+; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FAST_ISEL_SSE2-NEXT: callq returns_v16bf16@PLT
+;
+; AVX512BF16-LABEL: call_ret_v16bf16:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: pushq %rax
+; AVX512BF16-NEXT: vmovaps (%rdi), %ymm0
+; AVX512BF16-NEXT: callq returns_v16bf16@PLT
+;
+; FAST_ISEL_AVX512BF16-LABEL: call_ret_v16bf16:
+; FAST_ISEL_AVX512BF16: # %bb.0:
+; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
+; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %ymm0
+; FAST_ISEL_AVX512BF16-NEXT: callq returns_v16bf16@PLT
+;
+; AVXNECONVERT-LABEL: call_ret_v16bf16:
+; AVXNECONVERT: # %bb.0:
+; AVXNECONVERT-NEXT: pushq %rax
+; AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0
+; AVXNECONVERT-NEXT: callq returns_v16bf16@PLT
+;
+; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v16bf16:
+; FAST_ISEL_AVXNECONVERT: # %bb.0:
+; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
+; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0
+; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v16bf16@PLT
+ %val = load <16 x bfloat>, ptr %ptr
+ call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val)
+ unreachable
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index c6c088297c0ea..db615c8065d03 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -41,7 +41,7 @@ define void @test1(float %src, ptr %dest) nounwind {
 ; SOFTFLOAT: # %bb.0:
 ; SOFTFLOAT-NEXT: pushq %rbx
 ; SOFTFLOAT-NEXT: movq %rsi, %rbx
-; SOFTFLOAT-NEXT: callq __gnu_f2h_ieee@PLT
+; SOFTFLOAT-NEXT: callq __truncsfhf2@PLT
 ; SOFTFLOAT-NEXT: movw %ax, (%rbx)
 ; SOFTFLOAT-NEXT: popq %rbx
 ; SOFTFLOAT-NEXT: retq
@@ -66,7 +66,7 @@ define float @test2(ptr nocapture %src) nounwind {
 ; SOFTFLOAT: # %bb.0:
 ; SOFTFLOAT-NEXT: pushq %rax
 ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi
-; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT
+; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT
 ; SOFTFLOAT-NEXT: popq %rcx
 ; SOFTFLOAT-NEXT: retq
 %1 = load i16, ptr %src, align 2
@@ -94,9 +94,9 @@ define float @test3(float %src) nounwind uwtable readnone {
 ; SOFTFLOAT: # %bb.0:
 ; SOFTFLOAT-NEXT: pushq %rax
 ; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 16
-; SOFTFLOAT-NEXT: callq __gnu_f2h_ieee@PLT
+; SOFTFLOAT-NEXT: callq __truncsfhf2@PLT
 ; SOFTFLOAT-NEXT: movzwl %ax, %edi
-; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT
+; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT
 ; SOFTFLOAT-NEXT: popq %rcx
 ; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 8
 ; SOFTFLOAT-NEXT: retq
@@ -126,7 +126,7 @@ define double @test4(ptr nocapture %src) nounwind {
 ; SOFTFLOAT: # %bb.0:
 ; SOFTFLOAT-NEXT: pushq %rax
 ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi
-; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT
+; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT
 ; SOFTFLOAT-NEXT: movl %eax, %edi
 ; SOFTFLOAT-NEXT: callq __extendsfdf2@PLT
 ; SOFTFLOAT-NEXT: popq %rcx
diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll
index 24dabfc18b9e3..16ebf70126f8b 100644
--- a/llvm/test/CodeGen/X86/fmf-flags.ll
+++ b/llvm/test/CodeGen/X86/fmf-flags.ll
@@ -124,13 +124,13 @@ define dso_local float @div_arcp_by_const(half %x) {
 ; X86-NEXT: .cfi_def_cfa_offset 8
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: calll __gnu_h2f_ieee
+; X86-NEXT: calll __extendhfsf2
 ; X86-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-NEXT: fstps (%esp)
-; X86-NEXT: calll __gnu_f2h_ieee
+; X86-NEXT: calll __truncsfhf2
 ; X86-NEXT: movzwl %ax, %eax
 ; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: calll __gnu_h2f_ieee
+; X86-NEXT: calll __extendhfsf2
 ; X86-NEXT: popl %eax
 ; X86-NEXT: .cfi_def_cfa_offset 4
 ; X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/fp-i129.ll b/llvm/test/CodeGen/X86/fp-i129.ll
index 97116ce4e621f..c55c19abbd9b8 100644
--- a/llvm/test/CodeGen/X86/fp-i129.ll
+++ b/llvm/test/CodeGen/X86/fp-i129.ll
@@ -96,7 +96,7 @@ define i257 @fptosi257_double(double %a) nounwind {
 ; half tests
 define i257 @fptosi_half(half %a) nounwind {
 ; X86-LABEL: fptosi_half:
-; X86: __gnu_h2f_ieee
+; X86: __extendhfsf2
 ;
 ; X64-LABEL: fptosi_half:
 ; X64: __extendhfsf2
@@ -106,7 +106,7 @@ define i257 @fptosi_half(half %a) nounwind {
 define half @uitofp_half(i257 %a) nounwind {
 ; X86-LABEL: uitofp_half:
-; X86: __gnu_f2h_ieee
+; X86: __truncsfhf2
 ;
 ; X64-LABEL: uitofp_half:
 ; X64: __truncsfhf2
diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
index f141153d059ac..707b05f3478db 100644
--- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
@@ -28,7 +28,7 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp {
 ; X64-AVX512-LABEL: TestFPExtF16_F128:
 ; X64-AVX512: # %bb.0: # %entry
 ; X64-AVX512-NEXT: pushq %rax
-; X64-AVX512-NEXT: vmovsh vf16(%rip), %xmm0
+; X64-AVX512-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X64-AVX512-NEXT: callq __extendhftf2@PLT
 ; X64-AVX512-NEXT: vmovaps %xmm0, vf128(%rip)
 ; X64-AVX512-NEXT: popq %rax
@@ -40,7 +40,7 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp {
 ; X86-NEXT: subl $40, %esp
 ; X86-NEXT: movzwl vf16, %eax
 ; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: calll __gnu_h2f_ieee
+; X86-NEXT: calll __extendhfsf2
 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
 ; X86-NEXT: wait
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
index 04fce7badb951..85f4c945230e1 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -2060,7 +2060,7 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $24, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2158,7 +2158,7 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2253,7 +2253,7 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2349,7 +2349,7 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2445,7 +2445,7 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2538,7 +2538,7 @@ define i32 @test_signed_i32_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2633,7 +2633,7 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $20, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2758,7 +2758,7 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $20, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2885,7 +2885,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
 ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp)
@@ -3064,7 +3064,7 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
 ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
index fefc92c313511..47dc3ca3616ea 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -1883,7 +1883,7 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -1965,7 +1965,7 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2047,7 +2047,7 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2128,7 +2128,7 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $12, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2209,7 +2209,7 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $28, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2295,7 +2295,7 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $28, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00
@@ -2382,7 +2382,7 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $24, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT: fxch %st(1)
 ; X86-X87-NEXT: fucom %st(1)
@@ -2526,7 +2526,7 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $20, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT: fxch %st(1)
 ; X86-X87-NEXT: fucom %st(1)
@@ -2667,7 +2667,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $44, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
 ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp)
@@ -2821,7 +2821,7 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
-; X86-X87-NEXT: calll __gnu_h2f_ieee
+; X86-X87-NEXT: calll __extendhfsf2
 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
 ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll
index 35d16c3bac70d..959265d08299a 100644
--- a/llvm/test/CodeGen/X86/frem.ll
+++ b/llvm/test/CodeGen/X86/frem.ll
@@ -82,7 +82,7 @@ define void @frem_f128(fp128 %a0, fp128 %a1, ptr%p3) nounwind {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rbx
 ; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: callq fmodf128
+; CHECK-NEXT: callq fmodf128@PLT
 ; CHECK-NEXT: vmovaps %xmm0, (%rbx)
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index 0f73129d984bd..f1874cc03000a 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -15,7 +15,7 @@ define float @half_to_float() strictfp {
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16
 ; X86-NOF16C-NEXT: movzwl a, %eax
 ; X86-NOF16C-NEXT: movl %eax, (%esp)
-; X86-NOF16C-NEXT: calll __gnu_h2f_ieee
+; X86-NOF16C-NEXT: calll __extendhfsf2
 ; X86-NOF16C-NEXT: addl $12, %esp
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4
 ; X86-NOF16C-NEXT: retl
@@ -64,7 +64,7 @@ define double @half_to_double() strictfp {
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16
 ; X86-NOF16C-NEXT: movzwl a, %eax
 ; X86-NOF16C-NEXT: movl %eax, (%esp)
-; X86-NOF16C-NEXT: calll __gnu_h2f_ieee
+; X86-NOF16C-NEXT: calll __extendhfsf2
 ; X86-NOF16C-NEXT: addl $12, %esp
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4
 ; X86-NOF16C-NEXT: retl
@@ -116,7 +116,7 @@ define x86_fp80 @half_to_fp80() strictfp {
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16
 ; X86-NOF16C-NEXT: movzwl a, %eax
 ; X86-NOF16C-NEXT: movl %eax, (%esp)
-; X86-NOF16C-NEXT: calll __gnu_h2f_ieee
+; X86-NOF16C-NEXT: calll __extendhfsf2
 ; X86-NOF16C-NEXT: addl $12, %esp
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4
 ; X86-NOF16C-NEXT: retl
@@ -166,7 +166,7 @@ define void @float_to_half(float %0) strictfp {
 ; X86-NOF16C-NEXT: flds {{[0-9]+}}(%esp)
 ; X86-NOF16C-NEXT: fstps (%esp)
 ; X86-NOF16C-NEXT: wait
-; X86-NOF16C-NEXT: calll __gnu_f2h_ieee
+; X86-NOF16C-NEXT: calll __truncsfhf2
 ; X86-NOF16C-NEXT: movw %ax, a
 ; X86-NOF16C-NEXT: addl $12, %esp
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4
@@ -324,17 +324,17 @@ define void @add() strictfp {
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16
 ; X86-NOF16C-NEXT: movzwl a, %eax
 ; X86-NOF16C-NEXT: movl %eax, (%esp)
-; X86-NOF16C-NEXT: calll __gnu_h2f_ieee
+; X86-NOF16C-NEXT: calll __extendhfsf2
 ; X86-NOF16C-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NOF16C-NEXT: wait
 ; X86-NOF16C-NEXT: movzwl b, %eax
 ; X86-NOF16C-NEXT: movl %eax, (%esp)
-; X86-NOF16C-NEXT: calll __gnu_h2f_ieee
+; X86-NOF16C-NEXT: calll __extendhfsf2
 ; X86-NOF16C-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NOF16C-NEXT: faddp %st, %st(1)
 ; X86-NOF16C-NEXT: fstps (%esp)
 ; X86-NOF16C-NEXT: wait
-; X86-NOF16C-NEXT: calll __gnu_f2h_ieee
+; X86-NOF16C-NEXT: calll __truncsfhf2
 ; X86-NOF16C-NEXT: movw %ax, c
 ; X86-NOF16C-NEXT: addl $12, %esp
 ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll
index 3c6e14598571d..859139463b7e3 100644
--- a/llvm/test/CodeGen/X86/ldexp.ll
+++ b/llvm/test/CodeGen/X86/ldexp.ll
@@ -608,14 +608,14 @@ define half @ldexp_f16(half %arg0, i32 %arg1) {
 ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT: movl %eax, (%esp)
-; WIN32-NEXT: calll ___gnu_h2f_ieee
+; WIN32-NEXT: calll ___extendhfsf2
 ; WIN32-NEXT: movl %esi, {{[0-9]+}}(%esp)
 ; WIN32-NEXT: fstpl (%esp)
 ; WIN32-NEXT: calll _ldexp
 ; WIN32-NEXT: fstps {{[0-9]+}}(%esp)
 ; WIN32-NEXT: flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT: fstps (%esp)
-; WIN32-NEXT: calll ___gnu_f2h_ieee
+; WIN32-NEXT: calll ___truncsfhf2
 ; WIN32-NEXT: addl $16, %esp
 ; WIN32-NEXT: popl %esi
 ; WIN32-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll
index 96de34519556d..8436c1052552e 100644
--- a/llvm/test/CodeGen/X86/llvm.frexp.ll
+++ b/llvm/test/CodeGen/X86/llvm.frexp.ll
@@ -45,7 +45,7 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; WIN32-NEXT: subl $20, %esp
 ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT: movl %eax, (%esp)
-; WIN32-NEXT: calll ___gnu_h2f_ieee
+; WIN32-NEXT: calll ___extendhfsf2
 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT: fstpl (%esp)
@@ -54,7 +54,7 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; WIN32-NEXT: flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT: fstps (%esp)
 ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: calll ___gnu_f2h_ieee
+; WIN32-NEXT: calll ___truncsfhf2
 ; WIN32-NEXT: movl %esi, %edx
 ; WIN32-NEXT: addl $20, %esp
 ; WIN32-NEXT: popl %esi
@@ -95,7 +95,7 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
 ; WIN32-NEXT: subl $20, %esp
 ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT: movl %eax, (%esp)
-; WIN32-NEXT: calll ___gnu_h2f_ieee
+; WIN32-NEXT: calll ___extendhfsf2
 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT: fstpl (%esp)
@@ -103,7 +103,7 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
 ; WIN32-NEXT: fstps {{[0-9]+}}(%esp)
 ; WIN32-NEXT: flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT: fstps (%esp)
-; WIN32-NEXT: calll ___gnu_f2h_ieee
+; WIN32-NEXT: calll ___truncsfhf2
 ; WIN32-NEXT: addl $20, %esp
 ; WIN32-NEXT: retl
 %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
@@ -146,7 +146,7 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; WIN32-NEXT: subl $16, %esp
 ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT: movl %eax, (%esp)
-; WIN32-NEXT: calll ___gnu_h2f_ieee
+; WIN32-NEXT: calll ___extendhfsf2
 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT: fstpl (%esp)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index be83db26aa7ed..89ed0040a71c2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -1215,10 +1215,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
 ; AVX512BW-NEXT: kmovq %rcx, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
@@ -1294,10 +1294,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index ba51c65ccab13..75c470a6d40c6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -1161,23 +1161,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512BW-NEXT: movl $287445282, %ecx # imm = 0x11221122
 ; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488
 ; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
 ; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -1231,23 +1231,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: movl $287445282, %ecx # imm = 0x11221122
 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
 ; AVX512DQ-BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488
 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -2126,41 +2126,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2]
+; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1}
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
 ; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2}
 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
+; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -2231,41 +2230,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
 ; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
@@ -6905,7 +6903,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11)
 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
@@ -6927,7 +6925,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11)
 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
@@ -6944,7 +6942,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6968,7 +6966,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0
 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
@@ -7035,7 +7033,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11)
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero
@@ -7057,7 +7055,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
@@ -7070,7 +7068,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9
@@ -7083,7 +7081,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
@@ -7589,7 +7587,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
@@ -7611,7 +7609,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
@@ -7628,7 +7626,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -7652,7 +7650,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0
 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
@@ -7719,7 +7717,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11)
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero
@@ -7741,7 +7739,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
@@ -7754,7 +7752,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9
@@ -7767,7 +7765,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr
%in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index a8df418143f32..717d1e447e165 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -98,8 +98,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -110,8 +109,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -122,8 +120,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -145,8 +142,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} @@ -162,8 +158,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 
{%k1} {z} @@ -176,8 +171,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 @@ -200,21 +194,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) @@ -227,21 +220,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; 
AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) @@ -249,47 +241,25 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32: -; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] -; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-ONLY-NEXT: vzeroupper -; AVX512BW-ONLY-NEXT: retq -; -; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32: -; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vzeroupper -; AVX512VBMI-ONLY-NEXT: retq +; AVX512BW-LABEL: mask_replication_factor2_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 
192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> @@ -301,42 +271,41 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5 -; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k3 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k2 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm4 +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k4} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k5} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: 
vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) @@ -351,41 +320,40 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-LABEL: mask_replication_factor2_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: kmovw 2(%rdi), %k5 -; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 +; AVX512DQ-NEXT: vpmovm2d %k3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm3, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 +; AVX512DQ-NEXT: vpmovsxdq %ymm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: 
vmovdqa32 448(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) @@ -402,12 +370,9 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7] -; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -441,8 +406,7 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2525,8 +2489,7 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 @@ -2598,47 +2561,25 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: -; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] -; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: 
vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-ONLY-NEXT: vzeroupper -; AVX512BW-ONLY-NEXT: retq -; -; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: -; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vzeroupper -; AVX512VBMI-ONLY-NEXT: retq +; AVX512BW-LABEL: mask_replication_factor4_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> @@ -2747,11 +2688,9 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2785,8 +2724,7 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2997,8 +2935,7 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -3060,8 +2997,7 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -12956,8 +12892,7 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY: # %bb.0: ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} @@ -13083,10 +13018,10 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -13291,13 +13226,12 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k3 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 ; AVX512BW-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -13680,16 +13614,16 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; 
AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -13710,9 +13644,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm10, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm11, %k1 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $48, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 @@ -13735,8 +13669,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm12, %k2 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm10, %k2 +; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 @@ -13765,7 +13699,7 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1280(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) @@ -13775,9 +13709,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 512(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index da65fecba773b..d6208aca3b2b7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1654,12 +1654,27 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vshufps 
{{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> @@ -1667,12 +1682,34 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { } define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_1032_v2i64: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index f7c29cba30bd5..6b1d118ca97ad 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -6173,13 +6173,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; 
AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3] +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6209,13 +6209,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3] +; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index b0a3e6945f454..0d7902afdaa66 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -14,10 +14,9 @@ Parts: Size: 24 RootSignature: Version: 2 - NumParameters: 1 - RootParametersOffset: 3 NumStaticSamplers: 4 StaticSamplersOffset: 5 + Parameters: [] AllowInputAssemblerInputLayout: true DenyGeometryShaderRootAccess: true @@ -25,9 +24,8 @@ Parts: # CHECK-NEXT: Size: 24 # CHECK-NEXT: RootSignature: # CHECK-NEXT: Version: 2 -# CHECK-NEXT: 
NumParameters: 1 -# CHECK-NEXT: RootParametersOffset: 3 -# CHECK-NEXT: NumStaticSamplers: 4 -# CHECK-NEXT: StaticSamplersOffset: 5 +# CHECK-NEXT: NumStaticSamplers: 0 +# CHECK-NEXT: StaticSamplersOffset: 0 +# CHECK-NEXT: Parameters: [] # CHECK-NEXT: AllowInputAssemblerInputLayout: true # CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml new file mode 100644 index 0000000000000..8d5ab5c1b0b23 --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml @@ -0,0 +1,55 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 80 + RootSignature: + Version: 2 + NumStaticSamplers: 0 + StaticSamplersOffset: 64 + Parameters: + - ParameterType: Constants32Bit + ShaderVisibility: Hull + Constants: + Num32BitValues: 16 + ShaderRegister: 15 + RegisterSpace: 14 + - ParameterType: Constants32Bit + ShaderVisibility: Geometry + Constants: + Num32BitValues: 21 + ShaderRegister: 22 + RegisterSpace: 23 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + +# CHECK: - Name: RTS0 +# CHECK-NEXT: Size: 80 +# CHECK-NEXT: RootSignature: +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: NumStaticSamplers: 0 +# CHECK-NEXT: StaticSamplersOffset: 0 +# CHECK-NEXT: Parameters: +# CHECK-NEXT: - ParameterType: Constants32Bit +# CHECK-NEXT: ShaderVisibility: Hull +# CHECK-NEXT: Constants: +# CHECK-NEXT: Num32BitValues: 16 +# CHECK-NEXT: RegisterSpace: 14 +# CHECK-NEXT: ShaderRegister: 15 +# CHECK-NEXT: - ParameterType: Constants32Bit +# CHECK-NEXT: ShaderVisibility: Geometry +# CHECK-NEXT: Constants: +# CHECK-NEXT: Num32BitValues: 21 +# CHECK-NEXT: RegisterSpace: 23 +# CHECK-NEXT: ShaderRegister: 22 +# CHECK-NEXT: AllowInputAssemblerInputLayout: true +# CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/test/ThinLTO/X86/memprof-recursive.ll b/llvm/test/ThinLTO/X86/memprof-recursive.ll index 4b2b5490bc2cb..e1a9084b583b9 100644 --- a/llvm/test/ThinLTO/X86/memprof-recursive.ll +++ b/llvm/test/ThinLTO/X86/memprof-recursive.ll @@ -3,12 +3,15 @@ ;; See llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll for ;; information on how the test was created. +;; -stats requires asserts +; REQUIRES: asserts + ; RUN: opt -thinlto-bc %s >%t.o ;; Check behavior when we enable cloning of contexts involved with recursive -;; cycles, but not through the cycle itself. I.e. until full support for -;; recursion is added, the cloned recursive call from C back to B (line 12) will -;; not be updated to call a clone. +;; cycles, but not through the cycle itself. I.e. with full support for cloning +;; recursive cycles off, the cloned recursive call from C back to B (line 12) +;; will not be updated to call a clone. 
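The memprof_recursive.cc source these remarks refer to is not included in the patch; a plausible C++ reconstruction, inferred only from the mangled names (_Z1Dv, _Z1Ci, _Z1Bi, _Znam) and the line:column references checked below, looks roughly like this (the branch condition and argument values are guesses, offered as a reading aid rather than the actual test source):

    char *B(int n);

    char *D() {
      return new char[10];  // line 5: the allocation site that gets a cold clone
    }
    char *C(int n) {
      if (n == 0)
        return D();         // 10:12: call from C to D
      return B(n - 1);      // 12:10: the recursive backedge from C to B
    }
    char *B(int n) {
      return C(n);          // 15:10: call from B to C
    }
    int main() {
      char *a = B(0);       // 19:13: non-recursive context through B and C
      char *b = B(1);       // 20:13: context that cycles B -> C -> B once
      delete[] a;
      delete[] b;
      return 0;
    }

The shape of the problem is visible in the sketch: B and C form a cycle, so once B is cloned, context disambiguation has to decide what the C-to-B call at 12:10 should target, which is exactly what the -memprof-clone-recursive-contexts flag below controls.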
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \ ; RUN: -r=%t.o,_Z1Dv,plx \ @@ -19,6 +22,7 @@ ; RUN: -memprof-verify-ccg -memprof-verify-nodes \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: -o %t.out 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS @@ -39,7 +43,7 @@ ; RUN: --implicit-check-not="created clone" \ ; RUN: --implicit-check-not="marked with memprof allocation attribute cold" -;; Check the default behavior (enabled recursive callsites). +;; Check the default behavior (clone recursive callsites). ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \ ; RUN: -r=%t.o,_Z1Dv,plx \ @@ -47,11 +51,11 @@ ; RUN: -r=%t.o,_Z1Bi,plx \ ; RUN: -r=%t.o,main,plx \ ; RUN: -r=%t.o,_Znam, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -o %t.out 2>&1 | FileCheck %s \ -; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ -; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS +; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS \ +; RUN: --check-prefix=CLONE-RECUR-CALLSITES ;; Skipping recursive contexts should prevent spurious call to cloned version of ;; B from the context starting at memprof_recursive.cc:19:13, which is actually @@ -67,6 +71,7 @@ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ ; RUN: -memprof-allow-recursive-contexts=false \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: -o %t.out 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS @@ -76,6 +81,7 @@ ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1 +; CLONE-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1 ;; We should only call the cold clone for the recursive context if we enabled @@ -83,6 +89,7 @@ ; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 ; SKIP-RECUR-CONTEXTS-NOT: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 +; CLONE-RECUR-CALLSITES: 1 memprof-context-disambiguation - Number of backedges with deferred cloning target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = 
"x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll index a3b065667702f..f706184f9727e 100644 --- a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll +++ b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll @@ -14,7 +14,7 @@ define ptr @b(ptr %q) { ret ptr %tmp } -; CHECK: define ptr @c(ptr readnone returned captures(address_is_null, ret: address, provenance) %r) +; CHECK: define ptr @c(ptr readnone returned %r) @g = global i32 0 define ptr @c(ptr %r) { %a = icmp eq ptr %r, null diff --git a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll index 99406696d33d1..13954694eefe0 100644 --- a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll +++ b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll @@ -145,8 +145,8 @@ return: ; preds = %cond.end, %if.then3 ; TEST SCC test returning a pointer value argument ; -; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned captures(ret: address, provenance) %r) -; FNATTR: define ptr @ptr_scc_r1(ptr readnone %a, ptr readnone %r, ptr readnone captures(none) %b) +; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned %r) +; FNATTR: define ptr @ptr_scc_r1(ptr %a, ptr readnone %r, ptr readnone captures(none) %b) ; FNATTR: define ptr @ptr_scc_r2(ptr readnone %a, ptr readnone %b, ptr readnone %r) ; ; @@ -260,8 +260,8 @@ entry: ; TEST another SCC test ; -; FNATTR: define ptr @rt2_helper(ptr readnone captures(address_is_null) %a) -; FNATTR: define ptr @rt2(ptr readnone captures(address_is_null) %a, ptr readnone captures(ret: address, provenance) %b) +; FNATTR: define ptr @rt2_helper(ptr %a) +; FNATTR: define ptr @rt2(ptr readnone %a, ptr readnone %b) define ptr @rt2_helper(ptr %a) #0 { entry: %call = call ptr @rt2(ptr %a, ptr %a) @@ -284,8 +284,8 @@ if.end: ; TEST another SCC test ; -; FNATTR: define ptr @rt3_helper(ptr readnone captures(address_is_null) %a, ptr readnone %b) -; FNATTR: define ptr @rt3(ptr readnone captures(address_is_null) %a, ptr readnone %b) +; FNATTR: define ptr @rt3_helper(ptr %a, ptr %b) +; FNATTR: define ptr @rt3(ptr readnone %a, ptr readnone %b) define ptr @rt3_helper(ptr %a, ptr %b) #0 { entry: %call = call ptr @rt3(ptr %a, ptr %b) @@ -316,7 +316,7 @@ if.end: ; } ; ; -; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned captures(ret: address, provenance) %r) +; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned %r) declare void @unknown_fn(ptr) #0 define ptr @calls_unknown_fn(ptr %r) #0 { @@ -415,7 +415,7 @@ if.end: ; preds = %if.then, %entry ; } ; ; -; FNATTR: define ptr @bitcast(ptr readnone returned captures(ret: address, provenance) %b) +; FNATTR: define ptr @bitcast(ptr readnone returned %b) ; define ptr @bitcast(ptr %b) #0 { entry: @@ -433,7 +433,7 @@ entry: ; } ; ; -; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone %b) ; define ptr @bitcasts_select_and_phi(ptr %b) #0 { entry: @@ -462,7 +462,7 @@ if.end: ; preds = %if.then, %entry ; } ; ; -; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone %b) ; define ptr @ret_arg_arg_undef(ptr %b) #0 { entry: @@ -494,7 +494,7 @@ ret_undef: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_arg(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; 
FNATTR: define ptr @ret_undef_arg_arg(ptr readnone %b) ; define ptr @ret_undef_arg_arg(ptr %b) #0 { entry: @@ -526,7 +526,7 @@ ret_arg1: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone %b) define ptr @ret_undef_arg_undef(ptr %b) #0 { entry: %cmp = icmp eq ptr %b, null diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 6debe5de3966e..6164f2adbf5b9 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -7,7 +7,7 @@ define ptr @c1(ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @c1 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[Q:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned [[Q:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[Q]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) @@ -512,7 +512,7 @@ define void @test4_1(ptr %x4_1, i1 %c) { define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @test4_2 -; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned captures(ret: address, provenance) [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { +; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; FNATTRS: t: ; FNATTRS-NEXT: call void @test4_1(ptr null, i1 [[C]]) @@ -740,7 +740,7 @@ define void @captureStrip(ptr %p) { define i1 @captureICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmp -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -757,7 +757,7 @@ define i1 @captureICmp(ptr %x) { define i1 @captureICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmpRev -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr null, [[X]] ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -771,29 +771,10 @@ define i1 @captureICmpRev(ptr %x) { ret i1 %1 } -define i1 @captureICmpWrongPred(ptr %x) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @captureICmpWrongPred -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = icmp slt ptr [[X]], null -; FNATTRS-NEXT: ret i1 [[TMP1]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @captureICmpWrongPred -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp slt ptr [[X]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] -; - %1 = 
icmp slt ptr %x, null - ret i1 %1 -} - -; We could infer captures(address_is_null) here, but don't bother, because -; InstCombine will optimize the GEP away. define i1 @nocaptureInboundsGEPICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmp -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -813,7 +794,7 @@ define i1 @nocaptureInboundsGEPICmp(ptr %x) { define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmpRev -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr null, [[TMP1]] ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -830,46 +811,6 @@ define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ret i1 %2 } -define i1 @notInboundsGEPICmp(ptr %x) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @notInboundsGEPICmp -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; FNATTRS-NEXT: ret i1 [[TMP2]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @notInboundsGEPICmp -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] -; - %1 = getelementptr i32, ptr %x, i32 5 - %2 = icmp eq ptr %1, null - ret i1 %2 -} - -define i1 @inboundsGEPICmpNullPointerDefined(ptr %x) null_pointer_is_valid { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) -; FNATTRS-LABEL: define i1 @inboundsGEPICmpNullPointerDefined -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR16:[0-9]+]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; FNATTRS-NEXT: ret i1 [[TMP2]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @inboundsGEPICmpNullPointerDefined -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR12:[0-9]+]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] -; - %1 = getelementptr i32, ptr %x, i32 5 - %2 = icmp eq ptr %1, null - ret i1 %2 -} - define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @nocaptureDereferenceableOrNullICmp @@ -890,13 +831,13 @@ define i1 @nocaptureDereferenceableOrNullICmp(ptr 
dereferenceable_or_null(4) %x) define i1 @captureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) null_pointer_is_valid { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @captureDereferenceableOrNullICmp -; FNATTRS-SAME: (ptr readnone captures(address_is_null) dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16]] { +; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; ATTRIBUTOR-LABEL: define i1 @captureDereferenceableOrNullICmp -; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12]] { +; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] ; @@ -962,7 +903,7 @@ define void @readnone_indirec(ptr %f, ptr %p) { define ptr @captures_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @captures_ret_only -; FNATTRS-SAME: (ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], i64 8 ; FNATTRS-NEXT: ret ptr [[GEP]] ; @@ -976,8 +917,6 @@ define ptr @captures_ret_only(ptr %p) { ret ptr %gep } -; Even though the ptrtoint is only used in the return value, this should *not* -; be considered a read-only capture. define i64 @captures_not_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i64 @captures_not_ret_only @@ -996,52 +935,35 @@ define i64 @captures_not_ret_only(ptr %p) { } define void @captures_read_provenance(ptr %p) { -; FNATTRS-LABEL: define void @captures_read_provenance -; FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { -; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @captures_read_provenance -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define void @captures_read_provenance +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; COMMON-NEXT: ret void ; call void @capture(ptr captures(address, read_provenance) %p) ret void } define void @captures_unused_ret(ptr %p) { -; FNATTRS-LABEL: define void @captures_unused_ret -; FNATTRS-SAME: (ptr captures(address_is_null) [[P:%.*]]) { -; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @captures_unused_ret -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define void @captures_unused_ret +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; 
COMMON-NEXT: ret void ; call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret void } define ptr @captures_used_ret(ptr %p) { -; FNATTRS-LABEL: define ptr @captures_used_ret -; FNATTRS-SAME: (ptr captures(address_is_null, ret: address, provenance) [[P:%.*]]) { -; FNATTRS-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret ptr [[RET]] -; -; ATTRIBUTOR-LABEL: define ptr @captures_used_ret -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret ptr [[RET]] +; COMMON-LABEL: define ptr @captures_used_ret +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; COMMON-NEXT: ret ptr [[RET]] ; %ret = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret ptr %ret } -; Make sure this does not produce captures(ret: ...). We need to take the -; return capture components into account when handling argument SCCs. define ptr @scc_capture_via_ret(i1 %c, ptr %p) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @scc_capture_via_ret @@ -1077,72 +999,5 @@ else: ret ptr %p } -define i1 @improve_existing_captures(ptr captures(address) %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @improve_existing_captures -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[P:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null -; FNATTRS-NEXT: ret i1 [[CMP]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @improve_existing_captures -; ATTRIBUTOR-SAME: (ptr nofree readnone captures(address) [[P:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null -; ATTRIBUTOR-NEXT: ret i1 [[CMP]] -; - %cmp = icmp eq ptr %p, null - ret i1 %cmp -} - -define void @dont_increase_existing_captures(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @capture(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @capture(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_trivial_scc(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures_trivial_scc -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; COMMON-NEXT: call void @dont_increase_existing_captures_trivial_scc(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @capture(ptr captures(address, read_provenance) %p) - call void @dont_increase_existing_captures_trivial_scc(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_scc1(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures_scc1 -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @dont_increase_existing_captures_scc2(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @dont_increase_existing_captures_scc2(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_scc2(ptr %p) { -; FNATTRS-LABEL: define void @dont_increase_existing_captures_scc2 -;
FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { -; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; FNATTRS-NEXT: call void @dont_increase_existing_captures_scc1(ptr [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @dont_increase_existing_captures_scc2 -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: call void @dont_increase_existing_captures_scc1(ptr [[P]]) -; ATTRIBUTOR-NEXT: ret void -; - call void @capture(ptr captures(address, read_provenance) %p) - call void @dont_increase_existing_captures_scc1(ptr %p) - ret void -} - declare ptr @llvm.launder.invariant.group.p0(ptr) declare ptr @llvm.strip.invariant.group.p0(ptr) diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 94093568419af..0f6762f0d4342 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -19,7 +19,7 @@ define ptr @test1() { ; Return a pointer trivially nonnull (argument attribute) define ptr @test2(ptr nonnull %p) { ; FNATTRS-LABEL: define nonnull ptr @test2( -; FNATTRS-SAME: ptr nonnull readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: ptr nonnull readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[P]] ; ; ATTRIBUTOR-LABEL: define nonnull ptr @test2( @@ -194,7 +194,7 @@ exit: define ptr @test7(ptr %a) { ; FNATTRS-LABEL: define ptr @test7( -; FNATTRS-SAME: ptr readnone returned captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone returned [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr [[A]] ; ; ATTRIBUTOR-LABEL: define ptr @test7( @@ -206,7 +206,7 @@ define ptr @test7(ptr %a) { define ptr @test8(ptr %a) { ; FNATTRS-LABEL: define nonnull ptr @test8( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -221,7 +221,7 @@ define ptr @test8(ptr %a) { define ptr @test9(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test9( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -238,7 +238,7 @@ declare void @llvm.assume(i1) ; FIXME: missing nonnull define ptr @test10(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test10( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 ; FNATTRS-NEXT: call void @llvm.assume(i1 [[CMP]]) ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] @@ -263,7 +263,7 @@ define ptr @test10(ptr %a, i64 %n) { ; } define ptr @test11(ptr) local_unnamed_addr { ; FNATTRS-LABEL: define nonnull ptr @test11( -; FNATTRS-SAME: ptr readnone captures(address_is_null, ret: address, provenance) [[TMP0:%.*]]) local_unnamed_addr { +; FNATTRS-SAME: ptr readnone [[TMP0:%.*]]) local_unnamed_addr { ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null ; FNATTRS-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label 
[[TMP5:%.*]] ; FNATTRS: 3: @@ -362,7 +362,7 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f1(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f1( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { +; FNATTRS-SAME: ptr readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null ; FNATTRS-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] @@ -431,7 +431,7 @@ bb9: ; preds = %bb4, %bb define internal ptr @f2(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f2(ptr nonnull %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f2( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -452,7 +452,7 @@ bb: define dso_local noalias ptr @f3(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f3(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define dso_local noalias nonnull ptr @f3( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -945,7 +945,7 @@ exc: define ptr @gep1(ptr %p) { ; FNATTRS-LABEL: define nonnull ptr @gep1( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -961,7 +961,7 @@ define ptr @gep1(ptr %p) { define ptr @gep1_no_null_opt(ptr %p) #0 { ; Shouldn't be able to derive nonnull based on gep.
; FNATTRS-LABEL: define ptr @gep1_no_null_opt( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR8:[0-9]+]] { +; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR8:[0-9]+]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -976,7 +976,7 @@ define ptr @gep1_no_null_opt(ptr %p) #0 { define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FNATTRS-LABEL: define ptr addrspace(3) @gep2( -; FNATTRS-SAME: ptr addrspace(3) readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 1 ; FNATTRS-NEXT: ret ptr addrspace(3) [[Q]] ; @@ -992,7 +992,7 @@ define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FIXME: We should propagate dereferenceable here but *not* nonnull define ptr addrspace(3) @as(ptr addrspace(3) dereferenceable(4) %p) { ; FNATTRS-LABEL: define noundef ptr addrspace(3) @as( -; FNATTRS-SAME: ptr addrspace(3) readnone returned captures(ret: address, provenance) dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone returned dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr addrspace(3) [[P]] ; ; ATTRIBUTOR-LABEL: define ptr addrspace(3) @as( @@ -1383,7 +1383,7 @@ define void @PR43833_simple(ptr %0, i32 %1) { define ptr @pr91177_non_inbounds_gep(ptr nonnull %arg) { ; FNATTRS-LABEL: define ptr @pr91177_non_inbounds_gep( -; FNATTRS-SAME: ptr nonnull readnone captures(ret: address, provenance) [[ARG:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr nonnull readnone [[ARG:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[RES:%.*]] = getelementptr i8, ptr [[ARG]], i64 -8 ; FNATTRS-NEXT: ret ptr [[RES]] ; diff --git a/llvm/test/Transforms/FunctionAttrs/noundef.ll b/llvm/test/Transforms/FunctionAttrs/noundef.ll index 4f53c08804621..b7c583880501a 100644 --- a/llvm/test/Transforms/FunctionAttrs/noundef.ll +++ b/llvm/test/Transforms/FunctionAttrs/noundef.ll @@ -169,7 +169,7 @@ define i64 @test_trunc_with_constexpr() { define align 4 ptr @maybe_not_aligned(ptr noundef %p) { ; CHECK-LABEL: define align 4 ptr @maybe_not_aligned( -; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -177,7 +177,7 @@ define align 4 ptr @maybe_not_aligned(ptr noundef %p) { define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { ; CHECK-LABEL: define noundef align 4 ptr @definitely_aligned( -; CHECK-SAME: ptr noundef readnone returned align 4 captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned align 4 [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -185,7 +185,7 @@ define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { ; CHECK-LABEL: define nonnull ptr @maybe_not_nonnull( -; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -193,7 +193,7 @@ define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { define nonnull ptr @definitely_nonnull(ptr noundef nonnull %p) { ; CHECK-LABEL: define noundef nonnull ptr @definitely_nonnull( -; CHECK-SAME: ptr noundef nonnull readnone returned captures(ret: 
address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef nonnull readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 5fc88d623c0ec..b24c097ad54d0 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -35,7 +35,7 @@ define void @test1_2(ptr %x1_2, ptr %y1_2, ptr %z1_2) { define ptr @test2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define {{[^@]+}}@test2 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: store i32 0, ptr @x, align 4 ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -58,7 +58,7 @@ define ptr @test2(ptr %p) { define i1 @test3(ptr %p, ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test3 -; FNATTRS-SAME: (ptr readnone captures(address) [[P:%.*]], ptr readnone captures(address) [[Q:%.*]]) #[[ATTR1:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone [[P:%.*]], ptr readnone [[Q:%.*]]) #[[ATTR1:[0-9]+]] { ; FNATTRS-NEXT: [[A:%.*]] = icmp ult ptr [[P]], [[Q]] ; FNATTRS-NEXT: ret i1 [[A]] ; @@ -197,7 +197,7 @@ define void @test7_2(ptr preallocated(i32) %a) { define ptr @test8_1(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test8_1 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR1]] { +; FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -220,7 +220,7 @@ entry: define void @test8_2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; FNATTRS-LABEL: define {{[^@]+}}@test8_2 -; FNATTRS-SAME: (ptr writeonly captures(none) [[P:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: [[CALL:%.*]] = call ptr @test8_1(ptr [[P]]) ; FNATTRS-NEXT: store i32 10, ptr [[CALL]], align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/stats.ll b/llvm/test/Transforms/FunctionAttrs/stats.ll index dc0387e57174a..5f007b4078ff3 100644 --- a/llvm/test/Transforms/FunctionAttrs/stats.ll +++ b/llvm/test/Transforms/FunctionAttrs/stats.ll @@ -16,8 +16,8 @@ entry: ret void } -; CHECK: 1 function-attrs - Number of arguments marked captures(none) -; CHECK-NEXT: 2 function-attrs - Number of functions with improved memory attribute +; CHECK: 2 function-attrs - Number of functions with improved memory attribute +; CHECK-NEXT: 1 function-attrs - Number of arguments marked nocapture ; CHECK-NEXT: 1 function-attrs - Number of functions marked as nofree ; CHECK-NEXT: 2 function-attrs - Number of functions marked as norecurse ; CHECK-NEXT: 2 function-attrs - Number of functions marked as nosync diff --git a/llvm/test/Transforms/InstCombine/select-icmp-and.ll b/llvm/test/Transforms/InstCombine/select-icmp-and.ll index 1218799ab3dc5..16fb3f34047ee 100644 --- a/llvm/test/Transforms/InstCombine/select-icmp-and.ll +++ b/llvm/test/Transforms/InstCombine/select-icmp-and.ll @@ -391,9 +391,8 @@ define i32 @test15e_extra_use(i32 %X) { ;; (a & 128) ? 
256 : 0 define i32 @test15e_zext(i8 %X) { ; CHECK-LABEL: @test15e_zext( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[T2_NOT:%.*]] = icmp sgt i8 [[X:%.*]], -1 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2_NOT]], i32 0, i32 256 ; CHECK-NEXT: ret i32 [[T3]] ; %t1 = and i8 %X, 128 @@ -406,9 +405,7 @@ define i32 @test15e_zext(i8 %X) { define i32 @test15e_zext_extra_use(i8 %X) { ; CHECK-LABEL: @test15e_zext_extra_use( ; CHECK-NEXT: [[T2:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 256, i32 0 ; CHECK-NEXT: call void @use1(i1 [[T2]]) ; CHECK-NEXT: ret i32 [[T3]] ; @@ -438,8 +435,7 @@ define i32 @test15f_extra_use(i32 %X) { ; CHECK-LABEL: @test15f_extra_use( ; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 128 ; CHECK-NEXT: [[T2:%.*]] = icmp ne i32 [[T1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[T1]], 1 -; CHECK-NEXT: [[T3:%.*]] = xor i32 [[TMP1]], 256 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 0, i32 256 ; CHECK-NEXT: call void @use1(i1 [[T2]]) ; CHECK-NEXT: ret i32 [[T3]] ; @@ -453,10 +449,9 @@ define i32 @test15f_extra_use(i32 %X) { ;; (a & 128) ? 0 : 256 define i16 @test15f_trunc(i32 %X) { ; CHECK-LABEL: @test15f_trunc( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = shl i16 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = and i16 [[TMP2]], 256 -; CHECK-NEXT: [[T3:%.*]] = xor i16 [[TMP3]], 256 +; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 128 +; CHECK-NEXT: [[T2_NOT:%.*]] = icmp eq i32 [[T1]], 0 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2_NOT]], i16 256, i16 0 ; CHECK-NEXT: ret i16 [[T3]] ; %t1 = and i32 %X, 128 @@ -799,7 +794,9 @@ define i8 @select_bittest_to_xor(i8 %x) { ; CHECK-LABEL: @select_bittest_to_xor( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use1(i1 [[CMP]]) -; CHECK-NEXT: [[MASKSEL:%.*]] = xor i8 [[X]], -128 +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 127 +; CHECK-NEXT: [[MASKSEL1:%.*]] = select i1 [[CMP]], i8 -128, i8 0 +; CHECK-NEXT: [[MASKSEL:%.*]] = or disjoint i8 [[AND]], [[MASKSEL1]] ; CHECK-NEXT: ret i8 [[MASKSEL]] ; %cmp = icmp sgt i8 %x, -1 @@ -903,3 +900,15 @@ define i8 @neg_select_trunc_bittest_to_shl_extra_use(i8 %x) { %ret = select i1 %trunc, i8 4, i8 0 ret i8 %ret } + +define i16 @select_trunc_nuw_bittest_or(i8 %x) { +; CHECK-LABEL: @select_trunc_nuw_bittest_or( +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[TMP1]], i16 20, i16 4 +; CHECK-NEXT: ret i16 [[RES]] +; + %trunc = trunc nuw i8 %x to i1 + %select = select i1 %trunc, i16 16, i16 0 + %res = or i16 4, %select + ret i16 %res +} diff --git a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll index 7c100f579399d..ca2e23c1d082e 100644 --- a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll +++ b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll @@ -20,6 +20,34 @@ define i32 @select_icmp_eq_and_1_0_or_2(i32 %x, i32 %y) { ret i32 %select } +define i32 @select_icmp_eq_and_1_0_or_2_disjoint(i32 %x, i32 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_disjoint( +; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 +; CHECK-NEXT: 
[[SELECT:%.*]] = or disjoint i32 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 1 + %cmp = icmp eq i32 %and, 0 + %or = or disjoint i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + +define i32 @select_icmp_eq_and_1_0_add_2_nsw_nuw(i32 %x, i32 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_add_2_nsw_nuw( +; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = add nuw nsw i32 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 1 + %cmp = icmp eq i32 %and, 0 + %or = add nsw nuw i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + define <2 x i32> @select_icmp_eq_and_1_0_or_2_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_vec( ; CHECK-NEXT: [[AND:%.*]] = shl <2 x i32> [[X:%.*]], splat (i32 1) @@ -1696,6 +1724,20 @@ define i8 @select_icmp_eq_and_1_0_lshr_fv(i8 %x, i8 %y) { ret i8 %select } +define i8 @select_icmp_eq_and_1_0_lshr_exact_fv(i8 %x, i8 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_lshr_exact_fv( +; CHECK-NEXT: [[AND:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = lshr exact i8 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i8 [[SELECT]] +; + %and = and i8 %x, 1 + %cmp = icmp eq i8 %and, 0 + %blshr = lshr exact i8 %y, 2 + %select = select i1 %cmp, i8 %y, i8 %blshr + ret i8 %select +} + define i8 @select_icmp_eq_and_1_0_lshr_tv(i8 %x, i8 %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_lshr_tv( ; CHECK-NEXT: [[AND:%.*]] = shl i8 [[X:%.*]], 1 @@ -1712,9 +1754,9 @@ define i8 @select_icmp_eq_and_1_0_lshr_tv(i8 %x, i8 %y) { define i8 @select_trunc_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc i8 %x to i1 @@ -1725,9 +1767,9 @@ define i8 @select_trunc_or_2(i8 %x, i8 %y) { define i8 @select_not_trunc_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_not_trunc_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc i8 %x to i1 @@ -1739,9 +1781,8 @@ define i8 @select_not_trunc_or_2(i8 %x, i8 %y) { define i8 @select_trunc_nuw_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_nuw_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc nuw i8 %x to i1 @@ -1752,9 +1793,9 @@ define i8 @select_trunc_nuw_or_2(i8 %x, i8 %y) { define i8 @select_trunc_nsw_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_nsw_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc nsw i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 
[[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc nsw i8 %x to i1 @@ -1765,9 +1806,9 @@ define i8 @select_trunc_nsw_or_2(i8 %x, i8 %y) { define <2 x i8> @select_trunc_or_2_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @select_trunc_or_2_vec( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc <2 x i8> [[X:%.*]] to <2 x i1> -; CHECK-NEXT: [[OR:%.*]] = or <2 x i8> [[Y:%.*]], splat (i8 2) -; CHECK-NEXT: [[SELECT:%.*]] = select <2 x i1> [[TRUNC]], <2 x i8> [[OR]], <2 x i8> [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i8> [[X:%.*]], splat (i8 1) +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], splat (i8 2) +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i8> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[SELECT]] ; %trunc = trunc <2 x i8> %x to <2 x i1> diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll index 730d488119d13..4a9380b3f35e8 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -1321,3 +1321,93 @@ exit: ret i32 %accum } +define i16 @multiple_exit_none_via_latch(ptr %dst, i64 %x) { +; CHECK-LABEL: @multiple_exit_none_via_latch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[X:%.*]], i64 100) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 2, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP4]] +; CHECK-NEXT: store i64 0, ptr [[TMP5]], align 8 +; CHECK-NEXT: store i64 0, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 +; CHECK-NEXT: [[CMP120:%.*]] = icmp slt i64 [[IV]], 100 +; CHECK-NEXT: br i1 [[CMP120]], label [[LOOP_THEN:%.*]], label [[EXIT_2:%.*]] +; CHECK: loop.then: +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[IV]], [[X]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 
1 +; CHECK-NEXT: br label [[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: exit.1: +; CHECK-NEXT: ret i16 0 +; CHECK: exit.2: +; CHECK-NEXT: ret i16 1 +; +; TAILFOLD-LABEL: @multiple_exit_none_via_latch( +; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: br label [[LOOP_HEADER:%.*]] +; TAILFOLD: loop.header: +; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: store i64 0, ptr [[GEP]], align 8 +; TAILFOLD-NEXT: [[CMP120:%.*]] = icmp slt i64 [[IV]], 100 +; TAILFOLD-NEXT: br i1 [[CMP120]], label [[LOOP_THEN:%.*]], label [[EXIT_2:%.*]] +; TAILFOLD: loop.then: +; TAILFOLD-NEXT: [[CMP3:%.*]] = icmp ne i64 [[IV]], [[X:%.*]] +; TAILFOLD-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]] +; TAILFOLD: loop.latch: +; TAILFOLD-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; TAILFOLD-NEXT: br label [[LOOP_HEADER]] +; TAILFOLD: exit.1: +; TAILFOLD-NEXT: ret i16 0 +; TAILFOLD: exit.2: +; TAILFOLD-NEXT: ret i16 1 +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i64 0, ptr %gep + %cmp120 = icmp slt i64 %iv, 100 + br i1 %cmp120, label %loop.then, label %exit.2 + +loop.then: + %cmp3 = icmp ne i64 %iv, %x + br i1 %cmp3, label %loop.latch, label %exit.1 + +loop.latch: + %iv.next = add i64 %iv, 1 + br label %loop.header + +exit.1: + ret i16 0 + +exit.2: + ret i16 1 +} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll index d234dedc5a57a..1d09b1c1a0cb3 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll @@ -1,5 +1,8 @@ ;; Test recursion handling during cloning. -;; + +;; -stats requires asserts +; REQUIRES: asserts + ;; Original code looks like: ;; ;; #include @@ -35,13 +38,14 @@ ;; The IR was then reduced using llvm-reduce with the expected FileCheck input. ;; Check behavior when we enable cloning of contexts involved with recursive -;; cycles, but not through the cycle itself. I.e. until full support for -;; recursion is added, the cloned recursive call from C back to B (line 12) will -;; not be updated to call a clone. +;; cycles, but not through the cycle itself. I.e. with full support for cloning +;; recursive cycles off, the cloned recursive call from C back to B (line 12) +;; will not be updated to call a clone. ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: %s -S 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS @@ -57,13 +61,13 @@ ; RUN: --implicit-check-not="marked with memprof allocation attribute cold" \ ; RUN: --check-prefix=ALL -;; Check the default behavior (enabled recursive callsites). +;; Check the default behavior (clone recursive callsites). 
; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: %s -S 2>&1 | FileCheck %s \ -; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ -; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS +; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS \ +; RUN: --check-prefix=CLONE-RECUR-CALLSITES ;; Skipping recursive contexts should prevent spurious call to cloned version of ;; B from the context starting at memprof_recursive.cc:19:13, which is actually @@ -73,6 +77,7 @@ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ ; RUN: -memprof-allow-recursive-contexts=false \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: %s -S 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS @@ -84,6 +89,7 @@ ;; We should only call the cold clone for the recursive context if we enabled ;; recursive contexts via -memprof-allow-recursive-contexts=true (default). ; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 +; CLONE-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold @@ -95,6 +101,7 @@ ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi assigned to call function clone _Z1Ci ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci assigned to call function clone _Z1Dv ; ALL: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold +; CLONE-RECUR-CALLSITES: 1 memprof-context-disambiguation - Number of backedges with deferred cloning target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll index 7175816963ed1..e01dba328a3a1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll @@ -9,7 +9,7 @@ target triple = "aarch64" define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 { ; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_( -; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly captures(address_is_null) 
[[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0 ; CHECK-NEXT: br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll index d5edf83ee52e2..bbd4849c32296 100644 --- a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll +++ b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll @@ -12,7 +12,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, half %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 ; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll index 4b422f205138a..ee7698b116aa2 100644 --- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll +++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll @@ -14,7 +14,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 ; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll index cd2ed37b22db5..5f75bd788e4bb 100644 --- a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll +++ b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll @@ -11,7 +11,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { ; NOROTATION-LABEL: define void @test( -; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { ; NOROTATION-NEXT: entry: ; NOROTATION-NEXT: br label [[LOOP_HEADER:%.*]] ; NOROTATION: loop.header: @@ -26,7 +26,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { 
; NOROTATION-NEXT: ret void ; ; ROTATION-LABEL: define void @test( -; ROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; ROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { ; ROTATION-NEXT: entry: ; ROTATION-NEXT: [[_12_I1:%.*]] = icmp eq ptr [[START]], [[END]] ; ROTATION-NEXT: br i1 [[_12_I1]], label [[EXIT:%.*]], label [[LOOP_LATCH_PREHEADER:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll index c029781142af3..ae851e3319e1f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -13,17 +13,18 @@ define fastcc void @LzmaDec_DecodeReal2(ptr %p, i1 %arg) { ; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], ptr [[P:%.*]], i64 0, i32 4 ; CHECK-NEXT: br label [[DO_BODY66_I:%.*]] ; CHECK: do.body66.i: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP0]] -; CHECK-NEXT: br i1 %arg, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[DO_COND_I:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] ; CHECK: if.else.i: -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP1]], undef +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: br label [[DO_COND_I]] ; CHECK: do.cond.i: -; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ [[TMP2]], [[IF_ELSE_I]] ], [ [[TMP1]], [[DO_BODY66_I]] ] -; CHECK-NEXT: br i1 %arg, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] +; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ [[TMP2]], [[IF_ELSE_I]] ], [ [[TMP5]], [[DO_BODY66_I]] ] +; CHECK-NEXT: br i1 [[ARG]], label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] ; CHECK: do.end1006.i: -; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP3]] ; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[RANGE20_I]], align 4 ; CHECK-NEXT: ret void ; @@ -33,25 +34,25 @@ entry: br label %do.body66.i do.body66.i: ; preds = %do.cond.i, %entry - %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ] - %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ] - %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i - %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i + %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ zeroinitializer, %entry ] + %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ zeroinitializer, %entry ] + %.range.2.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %range.2.i + %.code.2.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %code.2.i br i1 %arg, label %do.cond.i, label %if.else.i if.else.i: ; preds = %do.body66.i - %sub91.i = sub i32 %.range.2.i, undef - %sub92.i = sub i32 %.code.2.i, undef + %sub91.i = sub i32 %.range.2.i, zeroinitializer + %sub92.i = sub i32 %.code.2.i, 
zeroinitializer br label %do.cond.i do.cond.i: ; preds = %if.else.i, %do.body66.i - %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ] + %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ zeroinitializer, %do.body66.i ] %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ] br i1 %arg, label %do.body66.i, label %do.end1006.i do.end1006.i: ; preds = %do.cond.i - %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i - %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i + %.range.4.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %range.4.i + %.code.4.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %code.4.i store i32 %.range.4.i, ptr %range20.i, align 4 store i32 %.code.4.i, ptr %code21.i, align 4 ret void diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test b/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test index 3e39591c46dce..81ca701e78a49 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test @@ -72,6 +72,7 @@ RUN: llvm-dwarfdump --statistics statistics-fib.split-dwarf.o | FileCheck %s CHECK: "version": 9, CHECK: "#functions": 3, CHECK: "#functions with location": 3, +CHECK: "#out-of-line functions": 3, CHECK: "#inlined functions": 7, CHECK: "#inlined functions with abstract origins": 7, CHECK: "#unique source variables": 9, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test b/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test index 855dcedc76f0b..82939c77e25d4 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test @@ -67,6 +67,7 @@ RUN: llvm-dwarfdump --statistics %t-statistics-fib.o | FileCheck %s CHECK: "version": 9, CHECK: "#functions": 3, CHECK: "#functions with location": 3, +CHECK: "#out-of-line functions": 3, CHECK: "#inlined functions": 8, CHECK: "#inlined functions with abstract origins": 8, CHECK: "#unique source variables": 9, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll index 05626e60ca0c7..97482e9c9b858 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll @@ -4,6 +4,7 @@ ; Test that abstract origins in multiple CUs are uniqued. ; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 3, ; CHECK: "#inlined functions": 2, ; CHECK: "#unique source variables": 4, ; CHECK-NEXT: "#source variables": 6, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll index 3e4feca06d56f..25f81f31d18ac 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll @@ -5,6 +5,7 @@ ; The results for both tests should be identical. 
; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 3, ; CHECK: "#inlined functions": 2, ; CHECK: "#unique source variables": 4, ; CHECK-NEXT: "#source variables": 6, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll index 85f66f492ff78..6fd3b84fdc19a 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll @@ -20,6 +20,7 @@ ; CHECK: "#functions": 3, ; CHECK-NEXT: "#functions with location": 3, +; CHECK-NEXT: "#out-of-line functions": 4, ; CHECK-NEXT: "#inlined functions": 0, ; CHECK-NEXT: "#inlined functions with abstract origins": 0, ; CHECK-NEXT: "#unique source variables": 1, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll index 2f1e1e15aa3a9..60ca52a274375 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll @@ -4,6 +4,7 @@ ; Test that statistics distinguish functions with the same name. ; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 4, ; CHECK: "#unique source variables": 2, ; CHECK-NEXT: "#source variables": 2, diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp index 6f2919318a6d5..1670709c08314 100644 --- a/llvm/tools/llvm-dwarfdump/Statistics.cpp +++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp @@ -971,6 +971,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, SaturatingUINT64 VarParamUnique = 0; SaturatingUINT64 VarParamWithLoc = 0; SaturatingUINT64 NumFunctions = 0; + SaturatingUINT64 NumOutOfLineFunctions = 0; SaturatingUINT64 NumInlinedFunctions = 0; SaturatingUINT64 NumFuncsWithSrcLoc = 0; SaturatingUINT64 NumAbstractOrigins = 0; @@ -999,6 +1000,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, << Entry.getKey() << ": " << V.getKey() << "\n"); NumFunctions += Stats.IsFunction; NumFuncsWithSrcLoc += Stats.HasSourceLocation; + NumOutOfLineFunctions += Stats.IsFunction * Stats.NumFnOutOfLine; NumInlinedFunctions += Stats.IsFunction * Stats.NumFnInlined; NumAbstractOrigins += Stats.IsFunction * Stats.NumAbstractOrigins; ParamTotal += Stats.NumParams; @@ -1024,6 +1026,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, printDatum(J, "#functions", NumFunctions.Value); printDatum(J, "#functions with location", NumFuncsWithSrcLoc.Value); + printDatum(J, "#out-of-line functions", NumOutOfLineFunctions.Value); printDatum(J, "#inlined functions", NumInlinedFunctions.Value); printDatum(J, "#inlined functions with abstract origins", NumAbstractOrigins.Value); diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 2da08127f20a8..fdae09ac767e6 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1624,8 +1624,6 @@ const EnumEntry ElfHeaderMipsFlags[] = { ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c"), \ - ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \ - ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"), \ 
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \ diff --git a/llvm/unittests/Analysis/CaptureTrackingTest.cpp b/llvm/unittests/Analysis/CaptureTrackingTest.cpp index 3f5c10d935167..73dd82fb921f7 100644 --- a/llvm/unittests/Analysis/CaptureTrackingTest.cpp +++ b/llvm/unittests/Analysis/CaptureTrackingTest.cpp @@ -77,9 +77,9 @@ TEST(CaptureTracking, MaxUsesToExplore) { struct CollectingCaptureTracker : public CaptureTracker { SmallVector Captures; void tooManyUses() override { } - Action captured(const Use *U, UseCaptureInfo CI) override { + bool captured(const Use *U) override { Captures.push_back(U); - return Continue; + return false; } }; diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index 0932938b209a4..22181ce33f0da 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -2060,7 +2060,7 @@ TEST_F(AArch64GISelMITest, LibcallFPExt) { auto CheckStr = R"( CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC CHECK: $h0 = COPY [[TRUNC]] - CHECK: BL &__gnu_h2f_ieee + CHECK: BL &__extendhfsf2 CHECK: $d0 = COPY CHECK: BL &__extenddftf2 )"; @@ -2103,7 +2103,7 @@ TEST_F(AArch64GISelMITest, LibcallFPTrunc) { auto CheckStr = R"( CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC CHECK: $s0 = COPY [[TRUNC]] - CHECK: BL &__gnu_f2h_ieee + CHECK: BL &__truncsfhf2 CHECK: $q0 = COPY CHECK: BL &__trunctfdf2 )"; diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 5a73f32ab7c32..8199397854384 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -890,3 +890,96 @@ TEST(RootSignature, ParseRootFlags) { "unsupported root signature flag value read: 4278190081")); } } + +TEST(RootSignature, ParseRootConstant) { + { + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<133>(Buffer))); + + const auto &RS = C.getRootSignature(); + ASSERT_TRUE(RS.has_value()); + ASSERT_EQ(RS->getVersion(), 2u); + ASSERT_EQ(RS->getNumParameters(), 1); + ASSERT_EQ(RS->getRootParametersOffset(), 24u); + ASSERT_EQ(RS->getNumStaticSamplers(), 0u); + ASSERT_EQ(RS->getStaticSamplersOffset(), 44u); + ASSERT_EQ(RS->getFlags(), 17u); + + const auto RootParam = RS->getParameters()[0]; + ASSERT_EQ((uint32_t)RootParam.ParameterType, 1u); + ASSERT_EQ((uint32_t)RootParam.ShaderVisibility, 2u); + ASSERT_EQ(RootParam.Constants.ShaderRegister, 15u); + ASSERT_EQ(RootParam.Constants.RegisterSpace, 14u); + ASSERT_EQ(RootParam.Constants.Num32BitValues, 16u); + } + { + // ParameterType has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 
0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<133>(Buffer)), + FailedWithMessage("unsupported parameter type value read: 255")); + } + { + // ShaderVisibility has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<133>(Buffer)), + FailedWithMessage("unsupported shader visility flag value read: 255")); + } + { + // Offset has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<133>(Buffer)), + FailedWithMessage("Reading structure out of file bounds")); + } +} diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index b48cd9ce53987..fed941f685272 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -127,22 +127,71 @@ TEST(RootSignature, ParseRootFlags) { Size: 24 RootSignature: Version: 2 - NumParameters: 0 - RootParametersOffset: 0 NumStaticSamplers: 0 StaticSamplersOffset: 0 + Parameters: [] 
AllowInputAssemblerInputLayout: true )")); uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; EXPECT_EQ(Storage.size(), 68u); EXPECT_TRUE(memcmp(Buffer, Storage.data(), 68u) == 0); } + +TEST(RootSignature, ParseRootConstants) { + SmallString<128> Storage; + + // First read a fully explicit yaml with all sizes and offsets provided + ASSERT_TRUE(convert(Storage, R"(--- !dxcontainer + Header: + Hash: [ 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, 0x5, + 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1 ] + Version: + Major: 1 + Minor: 0 + FileSize: 133 + PartCount: 1 + PartOffsets: [ 36 ] + Parts: + - Name: RTS0 + Size: 89 + RootSignature: + Version: 2 + NumStaticSamplers: 0 + StaticSamplersOffset: 56 + Parameters: + - ParameterType: Constants32Bit + ShaderVisibility: Hull + Constants: + Num32BitValues: 16 + ShaderRegister: 15 + RegisterSpace: 14 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + )")); + + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + + EXPECT_EQ(Storage.size(), 133u); + EXPECT_TRUE(memcmp(Buffer, Storage.data(), 133u) == 0); +} diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 2ad33659c609b..088264e0429fd 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -6166,3 +6166,26 @@ define void @bar() { // This should not crash, even though there is already a value for LLVMBar. 
Ctx.createFunction(&LLVMBar); } + +TEST_F(SandboxIRTest, OpaqueValue) { + parseIR(C, R"IR( +declare void @bar(metadata) +define void @foo() { + call void @bar(metadata !1) + call void asm "asm", ""() + ret void +} +!1 = !{} +)IR"); + Function &LLVMFoo = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(&LLVMFoo); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Call = cast(&*It++); + auto *Op0 = Call->getOperand(0); + EXPECT_TRUE(isa(Op0)); + auto *Asm = cast(&*It++); + auto *AsmOp0 = Asm->getOperand(0); + EXPECT_TRUE(isa(AsmOp0)); +} diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 93ac7381b02ef..5d771a1a153f7 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1167,6 +1167,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("a64fx", "armv8.2-a"), AArch64CPUTestParams("fujitsu-monaka", "armv9.3-a"), AArch64CPUTestParams("carmel", "armv8.2-a"), + AArch64CPUTestParams("grace", "armv9-a"), AArch64CPUTestParams("saphira", "armv8.4-a"), AArch64CPUTestParams("oryon-1", "armv8.6-a")), AArch64CPUTestParams::PrintToStringParamName); @@ -1247,7 +1248,6 @@ TEST_P(AArch64CPUAliasTestFixture, testCPUAlias) { INSTANTIATE_TEST_SUITE_P( AArch64CPUAliasTests, AArch64CPUAliasTestFixture, ::testing::Values(AArch64CPUAliasTestParams({"neoverse-n2", "cobalt-100"}), - AArch64CPUAliasTestParams({"neoverse-v2", "grace"}), AArch64CPUAliasTestParams({"apple-a7", "cyclone", "apple-a8", "apple-a9"}), AArch64CPUAliasTestParams({"apple-a12", "apple-s4", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index e3095e2f3df26..f18e40a2a5744 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -118,6 +118,7 @@ static_library("LLVMRISCVCodeGen") { "RISCVDeadRegisterDefinitions.cpp", "RISCVExpandAtomicPseudoInsts.cpp", "RISCVExpandPseudoInsts.cpp", + "RISCVFoldMemOffset.cpp", "RISCVFrameLowering.cpp", "RISCVGatherScatterLowering.cpp", "RISCVISelDAGToDAG.cpp", diff --git a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h index 517351cac6dbc..0608182f00b7e 100644 --- a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h @@ -23,8 +23,10 @@ #include "mlir-c/Diagnostics.h" #include "mlir-c/IR.h" +// clang-format off #include "mlir/Bindings/Python/Nanobind.h" #include "mlir-c/Bindings/Python/Interop.h" // This is expected after nanobind. +// clang-format on #include "llvm/ADT/Twine.h" // Raw CAPI type casters need to be declared before use, so always include them @@ -349,6 +351,7 @@ class pure_subclass { thisClass = metaclass(derivedClassName, nanobind::make_tuple(superClass), attributes); scope.attr(derivedClassName) = thisClass; + thisClass.attr("__module__") = scope.attr("__name__"); } template @@ -434,7 +437,7 @@ class mlir_attribute_subclass : public pure_subclass { const nanobind::object &superCls, GetTypeIDFunctionTy getTypeIDFunction = nullptr) : pure_subclass(scope, typeClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. 
Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -465,10 +468,13 @@ class mlir_attribute_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. + static const char kIsinstanceSig[] = + "def isinstance(other_attribute: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Attribute) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirAttribute other) { return isaFunction(other); }, - nanobind::arg("other_attribute")); + nanobind::arg("other_attribute"), nanobind::sig(kIsinstanceSig)); def("__repr__", [superCls, captureTypeName](nanobind::object self) { return nanobind::repr(superCls(self)) .attr("replace")(superCls.attr("__name__"), captureTypeName); @@ -512,7 +518,7 @@ class mlir_type_subclass : public pure_subclass { const nanobind::object &superCls, GetTypeIDFunctionTy getTypeIDFunction = nullptr) : pure_subclass(scope, typeClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -542,13 +548,17 @@ class mlir_type_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. + static const char kIsinstanceSig[] = + "def isinstance(other_type: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Type) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirType other) { return isaFunction(other); }, - nanobind::arg("other_type")); + nanobind::arg("other_type"), nanobind::sig(kIsinstanceSig)); def("__repr__", [superCls, captureTypeName](nanobind::object self) { - return nanobind::repr(superCls(self)) - .attr("replace")(superCls.attr("__name__"), captureTypeName); + return nanobind::cast( + nanobind::repr(superCls(self)) + .attr("replace")(superCls.attr("__name__"), captureTypeName)); }); if (getTypeIDFunction) { // 'get_static_typeid' method. @@ -590,7 +600,7 @@ class mlir_value_subclass : public pure_subclass { IsAFunctionTy isaFunction, const nanobind::object &superCls) : pure_subclass(scope, valueClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -620,10 +630,13 @@ class mlir_value_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. 
+ static const char kIsinstanceSig[] = + "def isinstance(other_value: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Value) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirValue other) { return isaFunction(other); }, - nanobind::arg("other_value")); + nanobind::arg("other_value"), nanobind::sig(kIsinstanceSig)); } }; diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 69745addfd748..f795dd89b79a1 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). The negateA, negateB, and negateC flags are only supported for double-precision - operations on gfx940+. + operations on gfx94x. }]; let assemblyFormat = [{ $sourceA `*` $sourceB `+` $destC diff --git a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt index ac8c651cdced8..610170f5944eb 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_dialect(EmitC emitc) -add_mlir_doc(EmitC EmitC Dialects/ -gen-dialect-doc) +add_mlir_doc(EmitC EmitC Dialects/ -gen-dialect-doc -dialect emitc) set(LLVM_TARGET_DEFINITIONS EmitCAttributes.td) mlir_tablegen(EmitCEnums.h.inc -gen-enum-decls) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 72fae1bdbf461..c270b0898f865 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -170,6 +170,10 @@ def LLVM_SinOp : LLVM_UnaryIntrOpF<"sin">; def LLVM_CosOp : LLVM_UnaryIntrOpF<"cos">; def LLVM_TanOp : LLVM_UnaryIntrOpF<"tan">; +def LLVM_ASinOp : LLVM_UnaryIntrOpF<"asin">; +def LLVM_ACosOp : LLVM_UnaryIntrOpF<"acos">; +def LLVM_ATanOp : LLVM_UnaryIntrOpF<"atan">; + def LLVM_SinhOp : LLVM_UnaryIntrOpF<"sinh">; def LLVM_CoshOp : LLVM_UnaryIntrOpF<"cosh">; def LLVM_TanhOp : LLVM_UnaryIntrOpF<"tanh">; diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 01059e42974d0..e9dcd112ce54e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k"> def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">; def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">; def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">; -// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a -// NEG bitfield. See IntrinsicsAMDGPU.td for more info. +// Note: in gfx94x, unlike in gfx90a, the f64 xdlops use the "blgp" argument as +// a NEG bitfield. See IntrinsicsAMDGPU.td for more info. def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">; def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">; -// New in gfx940. +// New in gfx94x. 
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">; def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">; def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">; @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; -// 2:4 Sparsity ops (GFX940) +// 2:4 Sparsity ops (GFX94x) def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">; def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">; def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 6a439bfb09078..a5725d6f1507e 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -858,7 +858,11 @@ def BatchMatmulOp : LinalgStructuredBase_Op<"batch_matmul", !listconcat([AttrSiz let arguments = (ins Variadic<AnyType>:$inputs, Variadic<AnyShaped>:$outputs, - DefaultValuedOptionalAttr<AffineMapArrayAttr, "{}">:$indexing_maps + DefaultValuedOptionalAttr< + AffineMapArrayAttr, + "BatchMatmulOp::getDefaultIndexingMaps($_builder.getContext())" + >:$indexing_maps, + DefaultValuedOptionalAttr<TypeFnAttr, "TypeFn::cast_signed">:$cast ); let results = (outs Variadic<AnyRankedTensor>:$result_tensors); let regions = (region AnyRegion:$region); @@ -884,9 +888,10 @@ def BatchMatmulOp : LinalgStructuredBase_Op<"batch_matmul", !listconcat([AttrSiz }]>, OpBuilder< (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, - CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes), + "Attribute":$cast, CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes), [{ $_state.addOperands(operands); + $_state.addAttribute("cast", cast); $_state.addAttributes(attributes); $_state.addTypes(resultTensorTypes); (void)$_state.addRegion(), diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 4d5837ca26c91..7cdf79f4dc59d 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1190,9 +1190,9 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> { }]; let arguments = (ins - Tosa_I1Tensor:$pred, - Tosa_Tensor:$on_true, - Tosa_Tensor:$on_false + Tosa_I1Tensor:$input1, + Tosa_Tensor:$input2, + Tosa_Tensor:$input3 ); let results = (outs @@ -1202,7 +1202,7 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> { let hasFolder = 1; let assemblyFormat = [{ - operands attr-dict `:` `(` type($pred) `,` type($on_true) `,` type($on_false) + operands attr-dict `:` `(` type($input1) `,` type($input2) `,` type($input3) `)` `->` type($output) }]; } diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index c821e7b1527b4..fbbf817ecff98 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -644,11 +644,13 @@ def Vector_ExtractElementOp : Results<(outs AnyType:$result)> { let summary = "extractelement operation"; let description = [{ + Note: This operation is deprecated. Please use vector.extract instead. + Takes a 0-D or 1-D vector and an optional dynamic index position and extracts the scalar at that position.
Note that this instruction resembles vector.extract, but is restricted to - 0-D and 1-D vectors and relaxed to dynamic indices. + 0-D and 1-D vectors. If the vector is 0-D, the position must be std::nullopt. @@ -834,11 +836,13 @@ def Vector_InsertElementOp : Results<(outs AnyVectorOfAnyRank:$result)> { let summary = "insertelement operation"; let description = [{ + Note: This operation is deprecated. Please use vector.insert instead. + Takes a scalar source, a 0-D or 1-D destination vector and a dynamic index position and inserts the source into the destination at the proper position. Note that this instruction resembles vector.insert, but is restricted to 0-D - and 1-D vectors and relaxed to dynamic indices. + and 1-D vectors. It is meant to be closer to LLVM's version: https://llvm.org/docs/LangRef.html#insertelement-instruction diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index c62314e504dcc..b29228ef87ea7 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -80,7 +80,7 @@ namespace { // Define commonly used chipsets versions for convenience. constexpr Chipset kGfx908 = Chipset(9, 0, 8); constexpr Chipset kGfx90a = Chipset(9, 0, 0xa); -constexpr Chipset kGfx940 = Chipset(9, 4, 0); +constexpr Chipset kGfx942 = Chipset(9, 4, 2); /// Define lowering patterns for raw buffer ops template @@ -483,7 +483,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, destElem = destType.getElementType(); if (sourceElem.isF32() && destElem.isF32()) { - if (mfma.getReducePrecision() && chipset >= kGfx940) { + if (mfma.getReducePrecision() && chipset >= kGfx942) { if (m == 32 && n == 32 && k == 4 && b == 1) return ROCDL::mfma_f32_32x32x4_xf32::getOperationName(); if (m == 16 && n == 16 && k == 8 && b == 1) @@ -551,9 +551,9 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, return ROCDL::mfma_i32_32x32x8i8::getOperationName(); if (m == 16 && n == 16 && k == 16 && b == 1) return ROCDL::mfma_i32_16x16x16i8::getOperationName(); - if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940) + if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942) return ROCDL::mfma_i32_32x32x16_i8::getOperationName(); - if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940) + if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942) return ROCDL::mfma_i32_16x16x32_i8::getOperationName(); } @@ -565,7 +565,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, } if (isa(sourceElem) && destElem.isF32() && - chipset >= kGfx940) { + chipset >= kGfx942) { // Known to be correct because there are no scalar f8 instructions and // because a length mismatch will have been caught by the verifier. 
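For context on the version guards in this hunk: Chipset compares lexicographically on (major, minor, stepping), so swapping the constant from Chipset(9, 4, 0) to Chipset(9, 4, 2) makes every `chipset >= kGfx942` check reject the removed gfx940/gfx941 steppings while still accepting newer ones. A minimal self-contained sketch of that ordering (ChipsetSketch is a hypothetical stand-in, not the actual mlir::amdgpu::Chipset API):

#include <cassert>
#include <tuple>

// Hypothetical three-field chipset version; the real type may differ.
struct ChipsetSketch {
  unsigned Major, Minor, Stepping;
};

// Lexicographic comparison, the property the >= guards rely on.
static bool operator>=(ChipsetSketch A, ChipsetSketch B) {
  return std::tie(A.Major, A.Minor, A.Stepping) >=
         std::tie(B.Major, B.Minor, B.Stepping);
}

int main() {
  ChipsetSketch Gfx942{9, 4, 2}, Gfx90a{9, 0, 0xa}, Gfx950{9, 5, 0};
  assert(!(Gfx90a >= Gfx942)); // older targets fail the guard
  assert(Gfx950 >= Gfx942);    // newer steppings keep passing it
  return 0;
}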
Type sourceBElem = @@ -585,7 +585,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, } if (isa(sourceElem) && destElem.isF32() && - chipset >= kGfx940) { + chipset >= kGfx942) { Type sourceBElem = cast(mfma.getSourceB().getType()).getElementType(); if (m == 16 && n == 16 && k == 32 && b == 1) { @@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern { return op->emitOpError("MFMA only supported on gfx908+"); uint32_t getBlgpField = static_cast(op.getBlgp()); if (op.getNegateA() || op.getNegateB() || op.getNegateC()) { - if (chipset < kGfx940) - return op.emitOpError("negation unsupported on older than gfx940"); + if (chipset < kGfx942) + return op.emitOpError("negation unsupported on older than gfx942"); getBlgpField |= op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2); } @@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite( ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite( PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite( PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -1038,7 +1038,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern { struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { - using ConvertAMDGPUToROCDLPassBase::ConvertAMDGPUToROCDLPassBase; + using Base::Base; void runOnOperation() override { MLIRContext *ctx = &getContext(); diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index 60a002c41bfb2..b22d852f7c543 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -384,7 +384,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() { } bool convertFP8Arithmetic = - maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0); + maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2); arith::populateArithToAMDGPUConversionPatterns( patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz, *maybeChipset); diff --git a/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp b/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp index 5887e37b7f0b4..1f2781aa82114 100644 --- a/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp +++ b/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp @@ -1338,7 +1338,7 @@ void mlir::arith::populateArithToSPIRVPatterns( 
namespace { struct ConvertArithToSPIRVPass : public impl::ConvertArithToSPIRVPassBase { - using ConvertArithToSPIRVPassBase::ConvertArithToSPIRVPassBase; + using Base::Base; void runOnOperation() override { Operation *op = getOperation(); diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index ea25d5afaeeca..5089179435f1e 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -1072,7 +1072,7 @@ namespace { struct ConvertComplexToStandardPass : public impl::ConvertComplexToStandardPassBase< ConvertComplexToStandardPass> { - using ConvertComplexToStandardPassBase::ConvertComplexToStandardPassBase; + using Base::Base; void runOnOperation() override; }; diff --git a/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp b/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp index a0ae39a353a95..03f4bf4df4912 100644 --- a/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp +++ b/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp @@ -28,7 +28,7 @@ namespace { class ConvertControlFlowToSPIRVPass final : public impl::ConvertControlFlowToSPIRVPassBase< ConvertControlFlowToSPIRVPass> { - using ConvertControlFlowToSPIRVPassBase::ConvertControlFlowToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp b/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp index 572a432d6d641..8ed9f659afb10 100644 --- a/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp +++ b/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp @@ -27,7 +27,7 @@ namespace { /// A pass converting MLIR Func operations into the SPIR-V dialect. class ConvertFuncToSPIRVPass : public impl::ConvertFuncToSPIRVPassBase { - using ConvertFuncToSPIRVPassBase::ConvertFuncToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index cfa434699cdef..c3b3a78abe7f7 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -9,6 +9,7 @@ #include "GPUOpsLowering.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Conversion/LLVMCommon/VectorPattern.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -586,22 +587,15 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite( return success(); } -/// Unrolls op if it's operating on vectors. -LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands, - ConversionPatternRewriter &rewriter, - const LLVMTypeConverter &converter) { +/// Helper for impl::scalarizeVectorOp. Scalarizes vectors to elements. +/// Used either directly (for ops on 1D vectors) or as the callback passed to +/// detail::handleMultidimensionalVectors (for ops on higher-rank vectors). 
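The unrolling strategy described above can be pictured with an ordinary loop: extract each lane, apply the scalar callback, insert the result back, and for higher-rank cases repeat that per 1-D slice. A minimal C++ analogy (unrollElementwise is a hypothetical helper, not the MLIR API):

#include <array>
#include <cmath>
#include <cstddef>

// Apply a scalar callback lane by lane, mirroring what the helper emits as
// llvm.extractelement / scalar op / llvm.insertelement for one 1-D vector.
// handleMultidimensionalVectors repeats this per 1-D slice of an
// !llvm.array of vectors.
template <std::size_t N, typename Fn>
std::array<float, N> unrollElementwise(const std::array<float, N> &Vec,
                                       Fn ScalarOp) {
  std::array<float, N> Result{};
  for (std::size_t I = 0; I < N; ++I)
    Result[I] = ScalarOp(Vec[I]);
  return Result;
}

int main() {
  std::array<float, 4> V{0.0f, 0.5f, 1.0f, 1.5f};
  auto Sines = unrollElementwise(V, [](float X) { return std::sin(X); });
  return Sines[0] == 0.0f ? 0 : 1; // sin(0) stays exactly 0
}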
+static Value scalarizeVectorOpHelper(Operation *op, ValueRange operands, + Type llvm1DVectorTy, + ConversionPatternRewriter &rewriter, + const LLVMTypeConverter &converter) { TypeRange operandTypes(operands); - if (llvm::none_of(operandTypes, llvm::IsaPred)) { - return rewriter.notifyMatchFailure(op, "expected vector operand"); - } - if (op->getNumRegions() != 0 || op->getNumSuccessors() != 0) - return rewriter.notifyMatchFailure(op, "expected no region/successor"); - if (op->getNumResults() != 1) - return rewriter.notifyMatchFailure(op, "expected single result"); - VectorType vectorType = dyn_cast(op->getResult(0).getType()); - if (!vectorType) - return rewriter.notifyMatchFailure(op, "expected vector result"); - + VectorType vectorType = cast(llvm1DVectorTy); Location loc = op->getLoc(); Value result = rewriter.create(loc, vectorType); Type indexType = converter.convertType(rewriter.getIndexType()); @@ -621,9 +615,32 @@ LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands, result = rewriter.create( loc, result, scalarOp->getResult(0), index); } + return result; +} - rewriter.replaceOp(op, result); - return success(); +/// Unrolls op to array/vector elements. +LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands, + ConversionPatternRewriter &rewriter, + const LLVMTypeConverter &converter) { + TypeRange operandTypes(operands); + if (llvm::any_of(operandTypes, llvm::IsaPred)) { + VectorType vectorType = cast(op->getResultTypes()[0]); + rewriter.replaceOp(op, scalarizeVectorOpHelper(op, operands, vectorType, + rewriter, converter)); + return success(); + } + + if (llvm::any_of(operandTypes, llvm::IsaPred)) { + return LLVM::detail::handleMultidimensionalVectors( + op, operands, converter, + [&](Type llvm1DVectorTy, ValueRange operands) -> Value { + return scalarizeVectorOpHelper(op, operands, llvm1DVectorTy, rewriter, + converter); + }, + rewriter); + } + + return rewriter.notifyMatchFailure(op, "no llvm.array or vector to unroll"); } static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) { diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h index e73a74845d2b6..bd2fd020f684b 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -172,13 +172,13 @@ struct GPUReturnOpLowering : public ConvertOpToLLVMPattern { }; namespace impl { -/// Unrolls op if it's operating on vectors. +/// Unrolls op to array/vector elements. LogicalResult scalarizeVectorOp(Operation *op, ValueRange operands, ConversionPatternRewriter &rewriter, const LLVMTypeConverter &converter); } // namespace impl -/// Rewriting that unrolls SourceOp to scalars if it's operating on vectors. +/// Unrolls SourceOp to array/vector elements. template struct ScalarizeVectorOpLowering : public ConvertOpToLLVMPattern { public: @@ -191,6 +191,7 @@ struct ScalarizeVectorOpLowering : public ConvertOpToLLVMPattern { *this->getTypeConverter()); } }; + } // namespace mlir #endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp index 5d53aef199d52..b06ab44d159af 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp @@ -27,7 +27,7 @@ namespace { /// A pass converting MLIR MemRef operations into the SPIR-V dialect. 
class ConvertMemRefToSPIRVPass : public impl::ConvertMemRefToSPIRVPassBase<ConvertMemRefToSPIRVPass> { - using ConvertMemRefToSPIRVPassBase::ConvertMemRefToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp index 8e2efbc7f4280..99631705851fd 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp @@ -34,7 +34,7 @@ namespace { // walk the function recursively to avoid considering nested loops. struct ForLoopMapper : public impl::ConvertAffineForToGPUPassBase<ForLoopMapper> { - using ConvertAffineForToGPUPassBase::ConvertAffineForToGPUPassBase; + using Base::Base; void runOnOperation() override { for (Operation &op : llvm::make_early_inc_range( diff --git a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp index 9e98dc7d7aaf6..f07386ea80124 100644 --- a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp +++ b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp @@ -29,7 +29,7 @@ namespace { /// A pass converting MLIR Tensor operations into the SPIR-V dialect. class ConvertTensorToSPIRVPass : public impl::ConvertTensorToSPIRVPassBase<ConvertTensorToSPIRVPass> { - using ConvertTensorToSPIRVPassBase::ConvertTensorToSPIRVPassBase; + using Base::Base; void runOnOperation() override { MLIRContext *context = &getContext(); diff --git a/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp b/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp index 15ddd3f5c16f1..ede3c9e0040fd 100644 --- a/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp +++ b/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp @@ -30,7 +30,7 @@ using namespace tosa; namespace { struct TosaToArith : public impl::TosaToArithPassBase<TosaToArith> { - using TosaToArithPassBase::TosaToArithPassBase; + using Base::Base; void runOnOperation() override { RewritePatternSet patterns(&getContext()); diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp index 77f972e0e5894..7459a6503cddf 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } // gfx9 has no, or very limited, support for floating-point min and max. if (chipset.majorVersion == 9) { - if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) { + if (chipset >= Chipset(9, 0, 0xa)) { // gfx90a supports f64 max (and min, but we don't have a min wrapper right // now) but all other types need to be emulated. target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>( [](RawBufferAtomicFmaxOp op) -> bool { return op.getValue().getType().isF64(); }); } else { target.addIllegalOp<RawBufferAtomicFmaxOp>(); } - if (chipset == Chipset(9, 4, 1)) { - // gfx941 requires non-CAS atomics to be implemented with CAS loops. - // The workaround here mirrors HIP and OpenMP.
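The CAS-loop workaround named in the comment just above has a well-known shape: load the current value, compute the combined result, and retry the compare-exchange until no other thread raced in between. A rough sketch of that shape (ignoring NaN and -0.0 subtleties; this is not the HIP or OpenMP implementation):

#include <algorithm>
#include <atomic>

// Emulate an atomic float max with a compare-and-swap retry loop.
float atomicFMax(std::atomic<float> &Slot, float Operand) {
  float Observed = Slot.load(std::memory_order_relaxed);
  while (!Slot.compare_exchange_weak(Observed, std::max(Observed, Operand),
                                     std::memory_order_acq_rel,
                                     std::memory_order_relaxed)) {
    // Observed was refreshed by the failed exchange; recompute and retry.
  }
  return Observed; // the value the successful exchange replaced
}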
- target.addIllegalOp(); - } } patterns.add< RawBufferAtomicByCasPattern, diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 147f5dd7a24b6..cfc51ad2a1524 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -3918,14 +3918,24 @@ LogicalResult AffineParallelOp::verify() { } unsigned expectedNumLBResults = 0; - for (APInt v : getLowerBoundsGroups()) - expectedNumLBResults += v.getZExtValue(); + for (APInt v : getLowerBoundsGroups()) { + unsigned results = v.getZExtValue(); + if (results == 0) + return emitOpError() + << "expected lower bound map to have at least one result"; + expectedNumLBResults += results; + } if (expectedNumLBResults != getLowerBoundsMap().getNumResults()) return emitOpError() << "expected lower bounds map to have " << expectedNumLBResults << " results"; unsigned expectedNumUBResults = 0; - for (APInt v : getUpperBoundsGroups()) - expectedNumUBResults += v.getZExtValue(); + for (APInt v : getUpperBoundsGroups()) { + unsigned results = v.getZExtValue(); + if (results == 0) + return emitOpError() + << "expected upper bound map to have at least one result"; + expectedNumUBResults += results; + } if (expectedNumUBResults != getUpperBoundsMap().getNumResults()) return emitOpError() << "expected upper bounds map to have " << expectedNumUBResults << " results"; diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp index 71ea0fd9d43cd..77840690e6a26 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -289,35 +289,6 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { SymbolTable::lookupNearestSymbolFrom(callOp, sym)); } -/// Gather equivalence info of CallOps. -/// Note: This only adds new equivalence info if the called function was already -/// analyzed. -// TODO: This does not handle cyclic function call graphs etc. -static void equivalenceAnalysis(func::FuncOp funcOp, - OneShotAnalysisState &state, - FuncAnalysisState &funcState) { - funcOp->walk([&](func::CallOp callOp) { - func::FuncOp calledFunction = getCalledFunction(callOp); - assert(calledFunction && "could not retrieved called func::FuncOp"); - - // No equivalence info available for the called function. - if (!funcState.equivalentFuncArgs.count(calledFunction)) - return WalkResult::skip(); - - for (auto it : funcState.equivalentFuncArgs[calledFunction]) { - int64_t returnIdx = it.first; - int64_t bbargIdx = it.second; - if (!state.isInPlace(callOp->getOpOperand(bbargIdx))) - continue; - Value returnVal = callOp.getResult(returnIdx); - Value argVal = callOp->getOperand(bbargIdx); - state.unionEquivalenceClasses(returnVal, argVal); - } - - return WalkResult::advance(); - }); -} - /// Return "true" if the given function signature has tensor semantics. static bool hasTensorSignature(func::FuncOp funcOp) { return llvm::any_of(funcOp.getFunctionType().getInputs(), @@ -493,9 +464,6 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, // Now analyzing function. funcState.startFunctionAnalysis(funcOp); - // Gather equivalence info for CallOps. - equivalenceAnalysis(funcOp, state, funcState); - // Analyze funcOp. 
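The equivalence bookkeeping that the removed helper performed (uniting a call result with the argument it is tied to) boils down to union-find over values. A toy version for flavor (EquivalenceSketch is hypothetical and unrelated to llvm::EquivalenceClasses):

#include <cassert>
#include <numeric>
#include <vector>

// Union-find keyed by dense value ids, with path compression on find.
struct EquivalenceSketch {
  std::vector<int> Parent;
  explicit EquivalenceSketch(int N) : Parent(N) {
    std::iota(Parent.begin(), Parent.end(), 0);
  }
  int find(int X) { return Parent[X] == X ? X : Parent[X] = find(Parent[X]); }
  void unite(int A, int B) { Parent[find(A)] = find(B); }
};

int main() {
  EquivalenceSketch EC(4);
  EC.unite(0, 2); // e.g. a call result united with its equivalent argument
  assert(EC.find(0) == EC.find(2));
  assert(EC.find(1) != EC.find(3));
  return 0;
}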
if (failed(analyzeOp(funcOp, state, statistics))) return failure(); @@ -514,9 +482,6 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, if (!state.getOptions().isOpAllowed(funcOp)) continue; - // Gather equivalence info for CallOps. - equivalenceAnalysis(funcOp, state, funcState); - // Analyze funcOp. if (failed(analyzeOp(funcOp, state, statistics))) return failure(); diff --git a/mlir/lib/Dialect/DLTI/DLTI.cpp b/mlir/lib/Dialect/DLTI/DLTI.cpp index b057554c40d8c..70e05cb4cb383 100644 --- a/mlir/lib/Dialect/DLTI/DLTI.cpp +++ b/mlir/lib/Dialect/DLTI/DLTI.cpp @@ -571,7 +571,8 @@ FailureOr dlti::query(Operation *op, ArrayRef keys, return failure(); MLIRContext *ctx = op->getContext(); - SmallVector entryKeys(keys.size()); + SmallVector entryKeys; + entryKeys.reserve(keys.size()); for (StringRef key : keys) entryKeys.push_back(StringAttr::get(ctx, key)); diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 1e8952a7edf4e..eb7ffe2e032c4 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -991,27 +991,6 @@ LogicalResult emitc::VerbatimOp::verify() { return success(); } -[[maybe_unused]] static ParseResult -parseVariadicTypeFmtArgs(AsmParser &p, SmallVector ¶ms) { - Type type; - if (p.parseType(type)) - return failure(); - - params.push_back(type); - while (succeeded(p.parseOptionalComma())) { - if (p.parseType(type)) - return failure(); - params.push_back(type); - } - - return success(); -} - -[[maybe_unused]] static void printVariadicTypeFmtArgs(AsmPrinter &p, - ArrayRef params) { - llvm::interleaveComma(params, p, [&](Type type) { p.printType(type); }); -} - FailureOr> emitc::VerbatimOp::parseFormatString() { // Error checking is done in verify. return ::parseFormatString(getValue(), getFmtArgs()); diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index b756a67f3ba7a..42ea0e1197ef1 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -3951,11 +3951,18 @@ void BatchMatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, RegionBuilderHelper helper(b, block); SmallVector yields; + TypeFn castVal = TypeFn::cast_signed; + auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { + return attr.getName() == "cast"; + }); + if (castIter != attrs.end()) { + if (auto attr = llvm::dyn_cast(castIter->getValue())) + castVal = attr.getValue(); + } + auto toType = block.getArgument(2).getType(); - Value castValA = - helper.buildTypeFn(TypeFn::cast_signed, toType, block.getArgument(0)); - Value castValB = - helper.buildTypeFn(TypeFn::cast_signed, toType, block.getArgument(1)); + Value castValA = helper.buildTypeFn(castVal, toType, block.getArgument(0)); + Value castValB = helper.buildTypeFn(castVal, toType, block.getArgument(1)); Value mulVal = helper.buildBinaryFn(BinaryFn::mul, castValA, castValB); Value addVal = helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), mulVal); @@ -4004,11 +4011,6 @@ ParseResult BatchMatmulOp::parse(OpAsmParser &parser, OperationState &result) { } void BatchMatmulOp::print(OpAsmPrinter &p) { - SmallVector elidedAttrs = { - "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; - ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); - SmallVector indexingMaps = llvm::map_to_vector( BatchMatmulOp::getDefaultIndexingMaps(getContext()), [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); @@ -4018,6 
+4020,11 @@ void BatchMatmulOp::print(OpAsmPrinter &p) { [&](Attribute attr) { p.printAttribute(attr); }); p << "]"; } + + SmallVector elidedAttrs = { + "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); } /// Verify the user defined indexing maps. diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 60cae77644291..f4b6955823085 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -1548,10 +1548,9 @@ static Value getCollapsedOpOperand(Location loc, LinalgOp op, /// Modify the `linalg.index` operations in the original generic op, to its /// value in the collapsed operation. -void generateCollapsedIndexingRegion(Location loc, Block *block, - const CollapsingInfo &collapsingInfo, - ValueRange loopRange, - RewriterBase &rewriter) { +static void generateCollapsedIndexingRegion( + Location loc, Block *block, const CollapsingInfo &collapsingInfo, + ArrayRef loopRange, RewriterBase &rewriter) { OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointToStart(block); @@ -1572,10 +1571,12 @@ void generateCollapsedIndexingRegion(Location loc, Block *block, Value newIndexVal = rewriter.create(loc, foldedDims.index()); for (auto dim : llvm::reverse(foldedDimsRef.drop_front())) { + Value loopDim = + getValueOrCreateConstantIndexOp(rewriter, loc, loopRange[dim]); indexReplacementVals[dim] = - rewriter.create(loc, newIndexVal, loopRange[dim]); + rewriter.createOrFold(loc, newIndexVal, loopDim); newIndexVal = - rewriter.create(loc, newIndexVal, loopRange[dim]); + rewriter.createOrFold(loc, newIndexVal, loopDim); } indexReplacementVals[foldedDims.value().front()] = newIndexVal; } @@ -1722,14 +1723,13 @@ FailureOr mlir::linalg::collapseOpIterationDims( LinalgOp collapsedOp = createCollapsedOp(op, collapsingInfo, rewriter); Location loc = op->getLoc(); + SmallVector loopBound = + llvm::map_to_vector(loopRanges, [](Range range) { return range.size; }); + if (collapsedOp.hasIndexSemantics()) { // Collect the loop range of the generic op. 
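The RemUIOp/DivUIOp chain built above is plain delinearization: it recovers the per-dimension indices of the folded loops from the single collapsed index, innermost dimension first. In ordinary C++ (delinearize is a hypothetical helper over fixed bounds):

#include <cassert>
#include <cstddef>
#include <vector>

// Peel indices off a linear index from the innermost bound outward, the
// same arithmetic the remu/divu pairs compute one dimension at a time.
std::vector<long> delinearize(long Linear, const std::vector<long> &Bounds) {
  std::vector<long> Idx(Bounds.size());
  for (std::size_t D = Bounds.size(); D-- > 1;) {
    Idx[D] = Linear % Bounds[D];
    Linear /= Bounds[D];
  }
  Idx[0] = Linear;
  return Idx;
}

int main() {
  // Folded bounds {3, 4, 5}: linear index 37 = 1*20 + 3*5 + 2 -> (1, 3, 2).
  std::vector<long> Idx = delinearize(37, {3, 4, 5});
  assert(Idx[0] == 1 && Idx[1] == 3 && Idx[2] == 2);
  return 0;
}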
OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPoint(collapsedOp); - SmallVector loopBound = - llvm::map_to_vector(loopRanges, [&](Range range) { - return getValueOrCreateConstantIndexOp(rewriter, loc, range.size); - }); generateCollapsedIndexingRegion(loc, &collapsedOp->getRegion(0).front(), collapsingInfo, loopBound, rewriter); } @@ -1747,15 +1747,22 @@ FailureOr mlir::linalg::collapseOpIterationDims( op.getIndexingMapMatchingResult(originalResult.value()); SmallVector reassociation = getOperandReassociation(indexingMap, collapsingInfo); + assert( + indexingMap.isProjectedPermutation() && + "Expected indexing map to be a projected permutation for collapsing"); + SmallVector resultShape = + applyPermutationMap(indexingMap, ArrayRef(loopBound)); Value result; if (isa(collapsedOpResult.getType())) { MemRefType expandShapeResultType = MemRefType::get( originalResultType.getShape(), originalResultType.getElementType()); result = rewriter.create( - loc, expandShapeResultType, collapsedOpResult, reassociation); + loc, expandShapeResultType, collapsedOpResult, reassociation, + resultShape); } else { result = rewriter.create( - loc, originalResultType, collapsedOpResult, reassociation); + loc, originalResultType, collapsedOpResult, reassociation, + resultShape); } results.push_back(result); } else { diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 62e1c4c3ed3b1..d725a457aeff6 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -32,7 +32,6 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" -#include "llvm/Support/Casting.h" #include #include #include diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index b9bcedb7fe71d..9bfc2aae1d6a5 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -65,12 +65,12 @@ void ConcatOp::getCanonicalizationPatterns(RewritePatternSet &results, } LogicalResult SelectOp::canonicalize(SelectOp op, PatternRewriter &rewriter) { - auto notOp = op.getPred().getDefiningOp(); + auto notOp = op.getInput1().getDefiningOp(); if (!notOp) return failure(); rewriter.modifyOpInPlace(op, [&]() { op.getOperation()->setOperands( - {notOp.getInput1(), op.getOnFalse(), op.getOnTrue()}); + {notOp.getInput1(), op.getInput3(), op.getInput2()}); }); return success(); } @@ -1131,18 +1131,18 @@ OpFoldResult SliceOp::fold(FoldAdaptor adaptor) { } OpFoldResult tosa::SelectOp::fold(FoldAdaptor adaptor) { - if (getOnTrue() == getOnFalse()) - return getOnTrue(); + if (getInput2() == getInput3()) + return getInput2(); auto predicate = - llvm::dyn_cast_if_present(adaptor.getPred()); + llvm::dyn_cast_if_present(adaptor.getInput1()); if (!predicate) return {}; if (!predicate.isSplat()) return {}; - return predicate.getSplatValue().getBoolValue() ? getOnTrue() - : getOnFalse(); + return predicate.getSplatValue().getBoolValue() ? 
getInput2() + : getInput3(); } OpFoldResult TileOp::fold(FoldAdaptor adaptor) { diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp index 79afc75fd6c8e..87b2a2695351b 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp @@ -169,9 +169,9 @@ struct ConvertTosaOp : public OpRewritePattern { LogicalResult matchAndRewrite(tosa::SelectOp tosaOp, PatternRewriter &rewriter) const override { - Value input1 = tosaOp.getPred(); - Value input2 = tosaOp.getOnTrue(); - Value input3 = tosaOp.getOnFalse(); + Value input1 = tosaOp.getInput1(); + Value input2 = tosaOp.getInput2(); + Value input3 = tosaOp.getInput3(); Value output = tosaOp.getResult(); auto outputType = dyn_cast(output.getType()); diff --git a/mlir/lib/Interfaces/FunctionInterfaces.cpp b/mlir/lib/Interfaces/FunctionInterfaces.cpp index 80f47a3f83676..57a8668117c68 100644 --- a/mlir/lib/Interfaces/FunctionInterfaces.cpp +++ b/mlir/lib/Interfaces/FunctionInterfaces.cpp @@ -199,8 +199,7 @@ void function_interface_impl::insertFunctionArguments( // There are 3 things that need to be updated: // - Function type. // - Arg attrs. - // - Block arguments of entry block. - Block &entry = op->getRegion(0).front(); + // - Block arguments of entry block, if not empty. // Update the argument attributes of the function. ArrayAttr oldArgAttrs = op.getArgAttrsAttr(); @@ -226,10 +225,15 @@ void function_interface_impl::insertFunctionArguments( setAllArgAttrDicts(op, newArgAttrs); } - // Update the function type and any entry block arguments. + // Update the function type. op.setFunctionTypeAttr(TypeAttr::get(newType)); - for (unsigned i = 0, e = argIndices.size(); i < e; ++i) - entry.insertArgument(argIndices[i] + i, argTypes[i], argLocs[i]); + + // Update entry block arguments, if not empty. + if (!op.isExternal()) { + Block &entry = op->getRegion(0).front(); + for (unsigned i = 0, e = argIndices.size(); i < e; ++i) + entry.insertArgument(argIndices[i] + i, argTypes[i], argLocs[i]); + } } void function_interface_impl::insertFunctionResults( @@ -279,8 +283,7 @@ void function_interface_impl::eraseFunctionArguments( // There are 3 things that need to be updated: // - Function type. // - Arg attrs. - // - Block arguments of entry block. - Block &entry = op->getRegion(0).front(); + // - Block arguments of entry block, if not empty. // Update the argument attributes of the function. if (ArrayAttr argAttrs = op.getArgAttrsAttr()) { @@ -292,9 +295,14 @@ void function_interface_impl::eraseFunctionArguments( setAllArgAttrDicts(op, newArgAttrs); } - // Update the function type and any entry block arguments. + // Update the function type. op.setFunctionTypeAttr(TypeAttr::get(newType)); - entry.eraseArguments(argIndices); + + // Update entry block arguments, if not empty. 
+ if (!op.isExternal()) { + Block &entry = op->getRegion(0).front(); + entry.eraseArguments(argIndices); + } } void function_interface_impl::eraseFunctionResults( diff --git a/mlir/python/mlir/dialects/linalg/__init__.py b/mlir/python/mlir/dialects/linalg/__init__.py index 5cda4769d593f..c5fbb833ee399 100644 --- a/mlir/python/mlir/dialects/linalg/__init__.py +++ b/mlir/python/mlir/dialects/linalg/__init__.py @@ -149,7 +149,8 @@ def __init__( generic = region_op(GenericOp_, terminator=YieldOp) -def matmul( +def create_op( + op_type, *ins: Union[Operation, OpView, Value], outs: Sequence[Union[Operation, OpView, Value]], indexing_maps: Optional[Sequence[AffineMapAttr]] = None, @@ -161,7 +162,7 @@ def matmul( init = _get_op_result_or_value(outs[0]) result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] - op = MatmulOp( + op = op_type( result_tensors=result_types, inputs=ins, outputs=[init], @@ -172,24 +173,32 @@ def matmul( return op +def matmul( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Optional[Sequence[AffineMapAttr]] = None, + cast: Optional[Union[TypeFn, Attribute]] = None, +): + return create_op(MatmulOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast) + + +def batch_matmul( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Optional[Sequence[AffineMapAttr]] = None, + cast: Optional[Union[TypeFn, Attribute]] = None, +): + return create_op( + BatchMatmulOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast + ) + + def contract( *ins: Union[Operation, OpView, Value], outs: Sequence[Union[Operation, OpView, Value]], indexing_maps: Sequence[AffineMapAttr], cast: Optional[Union[TypeFn, Attribute]] = None, ): - ins = [_get_op_result_or_value(input) for input in ins] - if len(outs) > 1: - raise ValueError(f"{outs=} must have length 1.") - init = _get_op_result_or_value(outs[0]) - result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] - - op = ContractOp( - result_tensors=result_types, - inputs=ins, - outputs=[init], - indexing_maps=indexing_maps, - cast=cast, + return create_op( + ContractOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast ) - fill_builtin_region(op.operation) - return op diff --git a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir index 7818a525d17b5..a313aaffdf5cc 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s // CHECK-LABEL: func @ext_scalar // CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8 diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir index f8a60d37801eb..52db1421dc3c6 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>, %arg2 : vector<16xf32>, %arg3 : vector<4xf32>, %arg4 : vector<4xf16>, %arg5 : vector<4xi8>, diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir 
b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir index cd921da2294e1..07a428566d488 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt --split-input-file %s \ -// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \ +// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \ // RUN: | FileCheck %s // CHECK-LABEL: func.func @scalar_trunc diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir index 985fb532ea74a..6bb5b9771c015 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s +// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s // CHECK-LABEL: func.func @scalar_ext // CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ) diff --git a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir index e4b2f01d6544a..9448304f11dbd 100644 --- a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir +++ b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir @@ -513,3 +513,54 @@ module { "test.possible_terminator"() : () -> () }) : () -> () } + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_sin_f16(f16) -> f16 + // CHECK-LABEL: func @math_sin_vector_1d + func.func @math_sin_vector_1d(%arg : vector<4xf16>) -> vector<4xf16> { + // CHECK: llvm.extractelement {{.*}} : vector<4xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<4xf16> + // CHECK: llvm.extractelement {{.*}} : vector<4xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<4xf16> + // CHECK: llvm.extractelement {{.*}} : vector<4xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<4xf16> + // CHECK: llvm.extractelement {{.*}} : vector<4xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<4xf16> + %result = math.sin %arg : vector<4xf16> + func.return %result : vector<4xf16> + } +} + +// ----- + +module @test_module { + // CHECK: llvm.func @__ocml_sin_f16(f16) -> f16 + // CHECK-LABEL: func @math_sin_vector_2d + func.func @math_sin_vector_2d(%arg : vector<2x2xf16>) -> vector<2x2xf16> { + // CHECK: builtin.unrealized_conversion_cast {{.*}} : vector<2x2xf16> to !llvm.array<2 x vector<2xf16>> + // CHECK: llvm.extractvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + // CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // CHECK: llvm.insertvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + // CHECK: llvm.extractvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + // CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // 
CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // CHECK: llvm.insertvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + %result = math.sin %arg : vector<2x2xf16> + func.return %result : vector<2x2xf16> + } +} diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir index 44e484b9ba598..da2913e3fec28 100644 --- a/mlir/test/Dialect/Affine/invalid.mlir +++ b/mlir/test/Dialect/Affine/invalid.mlir @@ -297,6 +297,24 @@ func.func @affine_parallel(%arg0 : index, %arg1 : index, %arg2 : index) { // ----- +func.func @no_lower_bound_affine_parallel() { + // expected-error@+1 {{expected lower bound map to have at least one result}} + affine.parallel (%arg2) = (max()) to (1) { + } + return +} + +// ----- + +func.func @no_upper_bound_affine_parallel() { + // expected-error@+1 {{expected upper bound map to have at least one result}} + affine.parallel (%arg3) = (0) to (min()) { + } + return +} + +// ----- + func.func @vector_load_invalid_vector_type() { %0 = memref.alloc() : memref<100xf32> affine.for %i0 = 0 to 16 step 8 { diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir index 2ca7f7109005c..c947407c63e74 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only dump-alias-sets" -split-input-file | FileCheck %s --check-prefix=CHECK-ALIAS // Run fuzzer with different seeds. 
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null @@ -1406,3 +1407,21 @@ func.func @caller(%c: i1, %t0: tensor<5xf32>, %t1: tensor<5xf32>, %t2: tensor<5x return %r : tensor<5xf32> } +// ----- + +// CHECK-ALIAS-LABEL: func @foo +func.func @foo(%arg0: tensor<?xf32>) -> tensor<?xf32> { + // CHECK-ALIAS: return + // CHECK-ALIAS-SAME: __equivalent_func_args__ = [0] + return %arg0 : tensor<?xf32> +} + +// CHECK-ALIAS: func @bar(%[[arg0:.*]]: tensor +func.func @bar(%arg0: tensor<?xf32>) -> tensor<?xf32> { + // CHECK-ALIAS: %[[call:.*]] = call @foo(%[[arg0]]) + // CHECK-ALIAS-SAME: {__inplace_operands_attr__ = ["true"], __opresult_alias_set_attr__ = [{{\[}}"%[[call]]", "%[[arg0]]"]]} + %x = call @foo(%arg0) : (tensor<?xf32>) -> tensor<?xf32> + // CHECK-ALIAS: return + // CHECK-ALIAS-SAME: __equivalent_func_args__ = [0] + return %x : tensor<?xf32> +} diff --git a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir index 7db997cd4c0b5..89734e7542801 100644 --- a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir +++ b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir @@ -225,6 +225,38 @@ func.func @fuse_by_collapsing_dynamic(%arg0 : tensor, // ----- +#map0 = affine_map<(d0, d1) -> (d0, d1)> +func.func @fuse_by_collapsing_dynamic_2(%arg0 : tensor<?xf32>, %sz0: index, %sz1: index) -> tensor<?x?xf32> { + %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [%sz0, %sz1] : tensor<?xf32> into tensor<?x?xf32> + %init = tensor.empty(%sz1, %sz0) : tensor<?x?xf32> + %1 = linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel", "parallel"]} + ins(%0 : tensor<?x?xf32>) + outs(%init : tensor<?x?xf32>) { + ^bb0(%b0 : f32, %b1 : f32): + %out = arith.negf %b0 : f32 + linalg.yield %out : f32 + } -> tensor<?x?xf32> + return %1 : tensor<?x?xf32> +} + +// CHECK-LABEL: func @fuse_by_collapsing_dynamic_2 +// CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] +// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] +// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[EXPANDED]], %[[C1]] +// CHECK: %[[OUT:.+]] = linalg.generic +// CHECK-SAME: ins(%[[ARG0]] : tensor<?xf32>) +// CHECK-SAME: outs(%{{.*}} : tensor<?xf32>) +// CHECK: %[[EXPANDED_1:.+]] = tensor.expand_shape %[[OUT]] +// CHECK-SAME: output_shape [%[[DIM0]], %[[DIM1]]] +// CHECK: return %[[EXPANDED_1]] + +// ----- + #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3)> func.func @fuse_reductions(%arg0 : tensor<2x?x5xf32>, %arg1 : tensor<2x5xf32>, %sz0: index) -> tensor<2x5xf32> { @@ -425,10 +457,11 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor, %arg1 : tensor<4 // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK: func @fuse_only_one_reassociation // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG1:.+]]: tensor<4x?x?x8xf32>, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index) -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[EXPAND_ARG0:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[SZ0]], 4, %[[SZ1]], 8] +// CHECK-DAG: %[[DIM:.+]] = tensor.dim %[[EXPAND_ARG0]], %[[C0]] : tensor<?x4x?x8xf32> +// CHECK-DAG: %[[DIM_2:.+]] = tensor.dim %[[EXPAND_ARG0]], %[[C2]] 
: tensor<?x4x?x8xf32> // CHECK-DAG: %[[COLLAPSE_ARG0:.+]] = tensor.collapse_shape %[[EXPAND_ARG0]] {{\[}}[0], [1], [2, 3]{{\]}} // CHECK-DAG: %[[COLLAPSE_ARG1_0:.+]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}} // CHECK-DAG: %[[COLLAPSE_ARG1_1:.+]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}} @@ -437,10 +470,7 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor, %arg1 : tensor<4 // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"] // CHECK-SAME: ins(%[[COLLAPSE_ARG0]], %[[COLLAPSE_ARG1_0]] : // CHECK-SAME: outs(%[[COLLAPSE_ARG1_1]] : -// CHECK: %[[DIM:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor<4x?x?xf32> -// CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C2]] : tensor<4x?x?xf32> -// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C8]] : index -// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0], [1], [2, 3]] output_shape [4, %[[DIM]], %[[VAL_1]], 8] : tensor<4x?x?xf32> into tensor<4x?x?x8xf32> +// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0], [1], [2, 3]] output_shape [4, %[[DIM]], %[[DIM_2]], 8] : tensor<4x?x?xf32> into tensor<4x?x?x8xf32> // CHECK: return %[[EXPANDED_3]] // ----- @@ -475,15 +505,16 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1, d0)> // CHECK: func @fold_non_consecutive_dims( // CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xi32>, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index) -// CHECK: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[C4:.+]] = arith.constant 4 : index -// CHECK: %[[C8:.+]] = arith.constant 8 : index -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[SZ0]], 4, %[[SZ1]], 8] : tensor<?x?xi32> into tensor<?x4x?x8xi32> -// CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] -// CHECK: %[[DIM_0:.+]] = tensor.dim %[[EXPANDED]], %[[C2]] +// CHECK-DAG: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] +// CHECK-DAG: %[[DIM_0:.+]] = tensor.dim %[[EXPANDED]], %[[C2]] // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM_0]], %[[DIM]]) +// CHECK-DAG: %[[DIM_1:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] +// CHECK-DAG: %[[DIM_2:.+]] = tensor.dim %[[EXPANDED]], %[[C2]] // CHECK: %[[COLLAPSE_INIT:.+]] = tensor.collapse_shape %[[INIT]] {{\[}}[0, 1], [2, 3]{{\]}} // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]] @@ -502,11 +533,7 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK-DAG: %[[T6:.+]] = arith.addi %[[T5]], %[[T3]] // CHECK-DAG: %[[T7:.+]] = arith.index_cast %[[T6]] // CHECK: linalg.yield %[[T7]] -// CHECK: %[[DIM_1:.+]] = tensor.dim %[[GENERIC]], %[[C0]] : tensor<?x?xi32> -// CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor<?x?xi32> -// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C8]] : index -// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C4]] : index -// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 4] : tensor<?x?xi32> into tensor<?x8x?x4xi32> +// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[DIM_2]], 8, %[[DIM_1]], 4] : tensor<?x?xi32> into tensor<?x8x?x4xi32> // CHECK: return %[[EXPANDED_3]] // 
----- diff --git a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir index 7acbd843cd1e7..fd3c321722508 100644 --- a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir +++ b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir @@ -5,15 +5,14 @@ // CHECK-LABEL: func @reshape // CHECK-SAME: (%[[A:.*]]: tensor<?x16xf32>, %[[B:.*]]: tensor<16xf32>, %[[INIT:.*]]: tensor<?x112x16xf32>, %[[SZ0:.*]]: index) -// CHECK: %[[C112:.*]] = arith.constant 112 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[A]] +// CHECK: %[[DIM:.*]] = tensor.dim %[[EXPANDED]], %[[C0]] // CHECK: %[[RI:.*]] = tensor.collapse_shape %[[INIT]] {{\[}}[0, 1], [2]] : tensor<?x112x16xf32> into tensor<?x16xf32> // CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]], #[[$MAP2]]], // CHECK-SAME: iterator_types = ["parallel", "parallel"]} // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x16xf32>, tensor<16xf32>) outs(%[[RI]] : tensor<?x16xf32>) -// CHECK: %[[DIM:.*]] = tensor.dim %[[R]], %[[C0]] : tensor<?x16xf32> -// CHECK: %[[VAL_1:.*]] = arith.divsi %[[DIM]], %[[C112]] : index -// CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[\[}}0, 1], [2]] output_shape [%[[VAL_1]], 112, 16] : tensor<?x16xf32> into tensor<?x112x16xf32> +// CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[\[}}0, 1], [2]] output_shape [%[[DIM]], 112, 16] : tensor<?x16xf32> into tensor<?x112x16xf32> // CHECK: return %[[RR]] : tensor<?x112x16xf32> func.func @reshape(%A: tensor<?x16xf32>, %B: tensor<16xf32>, %init: tensor<?x112x16xf32>, %sz0: index) -> tensor<?x112x16xf32> { %0 = tensor.expand_shape %A [[0, 1], [2]] output_shape [%sz0, 112, 16] diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 8474eeac0db5b..1bd9c8825b05e 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1497,7 +1497,7 @@ func.func @matmul_transpose_b(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %a // CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<2x5x7xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_k_to_fill_missing_dims_A(%arg0: memref<5xf32>, %ar @@ -1520,7 +1520,7 @@ func.func @batch_matmul_bcast_k_to_fill_missing_dims_A(%arg0: memref<5xf32>, %ar // CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<2x5x7xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_batch_dim_A(%arg0: memref<3x5xf32>, %arg1: memref< @@ -1543,7 +1543,7 @@ func.func @batch_matmul_bcast_batch_and_n_dim_B(%arg0: memref<2x3x5xf32>, %arg1: // CHECK-SAME: %[[VAL_0:.*]]: memref<2x3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, // 
CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_batch_and_n_dim_B(%arg0: memref<2x3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<2x3x7xf32>) { @@ -1566,7 +1566,7 @@ func.func @batch_matmul_bcast_batch_and_n_dim_B(%arg0: memref<2x3x5xf32>, %arg1: // CHECK-SAME: %[[VAL_0:.*]]: memref<2x3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } @@ -1622,7 +1622,7 @@ func.func @batch_matmul_explicit_transpose_B(%arg0: memref<2x3x5xf32>, %arg1: me // CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<2x7x5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x7x5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x7x5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_A_transpose_B(%arg0: memref<3x5xf32>, %arg1: memref<2x7x5xf32>, %arg2: memref<2x3x7xf32>) { diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir index 6562a7c2ab55c..d0c0c8456d1ca 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir +++ b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir @@ -31,22 +31,22 @@ func.func @depthwise_conv2d_as_mul(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1 // CHECK-LABEL: @depthwise_conv2d_as_mul_q func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<1x1x2x3xi8>, %arg2: tensor<6xi32>) -> tensor<4x10x10x6xi32> { // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {value = dense<[4, 10, 10, 2, 1]> : tensor<5xindex> - // CHECK-DAG: %[[iZp:.+]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1x1x1xi32>} - // CHECK-DAG: %[[wZp:.+]] = "tosa.const"() <{value = dense<11> : tensor<1x1x1x1xi32>} + // CHECK-DAG: %[[INPUT_ZP:.+]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1x1x1xi32>} + // CHECK-DAG: %[[WEIGHT_ZP:.+]] = "tosa.const"() <{value = dense<11> : tensor<1x1x1x1xi32>} // CHECK-DAG: %[[CONST3:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 2, 3]> : tensor<5xindex> // CHECK-DAG: %[[CONST4:.+]] = tosa.const_shape {value = dense<[4, 10, 10, 6]> : tensor<4xindex> // CHECK-DAG: %[[CONST5:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 6]> : tensor<4xindex> // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> - // CHECK: %[[rIn:.+]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[cIn:.+]] = tosa.cast 
%[[rIn]] : (tensor<4x10x10x2x1xi8>) -> tensor<4x10x10x2x1xi32> - // CHECK: %[[cWe:.+]] = tosa.cast %arg1 : (tensor<1x1x2x3xi8>) -> tensor<1x1x2x3xi32> - // CHECK: %[[sIn:.+]] = tosa.sub %[[cIn]], %[[iZp]] - // CHECK: %[[sWe:.+]] = tosa.sub %[[cWe]], %[[wZp]] - // CHECK: %[[resWe:.+]] = tosa.reshape %[[sWe]], %[[CONST3]] - // CHECK: %[[mul:.+]] = tosa.mul %[[sIn]], %[[resWe]], %[[SHIFT]] - // CHECK: %[[reO:.+]] = tosa.reshape %[[mul]], %[[CONST4]] - // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2, %[[CONST5]] - // CHECK: %[[add:.+]] = tosa.add %[[reO]], %[[reArg2]] + // CHECK: %[[RESHAPE_I:.+]] = tosa.reshape %arg0, %[[CONST0]] + // CHECK: %[[CAST_I:.+]] = tosa.cast %[[RESHAPE_I]] : (tensor<4x10x10x2x1xi8>) -> tensor<4x10x10x2x1xi32> + // CHECK: %[[CAST_W:.+]] = tosa.cast %arg1 : (tensor<1x1x2x3xi8>) -> tensor<1x1x2x3xi32> + // CHECK: %[[SUB_I:.+]] = tosa.sub %[[CAST_I]], %[[INPUT_ZP]] + // CHECK: %[[SUB_W:.+]] = tosa.sub %[[CAST_W]], %[[WEIGHT_ZP]] + // CHECK: %[[RESHAPE_W:.+]] = tosa.reshape %[[SUB_W]], %[[CONST3]] + // CHECK: %[[MUL:.+]] = tosa.mul %[[SUB_I]], %[[RESHAPE_W]], %[[SHIFT]] + // CHECK: %[[RESHAPE_O:.+]] = tosa.reshape %[[MUL]], %[[CONST4]] + // CHECK: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[CONST5]] + // CHECK: %[[ADD:.+]] = tosa.add %[[RESHAPE_O]], %[[RESHAPE_ARG2]] %input_zp = "tosa.const"() {value = dense<7> : tensor<1xi8>} : () -> tensor<1xi8> %weight_zp = "tosa.const"() {value = dense<11> : tensor<1xi8>} : () -> tensor<1xi8> %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = i32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xi8>, tensor<1x1x2x3xi8>, tensor<6xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<4x10x10x6xi32> @@ -58,19 +58,19 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor< // CHECK-LABEL: @depthwise_conv2d_as_mul_padded func.func @depthwise_conv2d_as_mul_padded(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x12x12x6xf32> { // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {value = dense<[4, 10, 10, 2, 1]> : tensor<5xindex>} - // CHECK-DAG: %[[pad:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xindex>} : () -> !tosa.shape<10> - // CHECK-DAG: %[[zero:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>} + // CHECK-DAG: %[[PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xindex>} : () -> !tosa.shape<10> + // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>} // CHECK-DAG: %[[CONST3:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 2, 3]> : tensor<5xindex>} // CHECK-DAG: %[[CONST4:.+]] = tosa.const_shape {value = dense<[4, 12, 12, 6]> : tensor<4xindex>} // CHECK-DAG: %[[CONST5:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 6]> : tensor<4xindex>} // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> - // CHECK: %[[reIn:.+]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, !tosa.shape<10>, tensor<f32>) -> tensor<4x12x12x2x1xf32> - // CHECK: %[[reArg1:.+]] = tosa.reshape %arg1, %[[CONST3]] - // CHECK: %[[mul:.+]] = tosa.mul %[[padded]], %[[reArg1]], %[[SHIFT]] - // CHECK: %[[reOut:.+]] = tosa.reshape %[[mul]], %[[CONST4]] - // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2, %[[CONST5]] - // CHECK: %[[add:.+]] = tosa.add %[[reOut]], %[[reArg2]] + // CHECK: %[[RESHAPE_I:.+]] = tosa.reshape %arg0, %[[CONST0]] + 
// CHECK: %[[PAD_I:.+]] = tosa.pad %[[RESHAPE_I]], %[[PAD]], %[[ZERO]] : (tensor<4x10x10x2x1xf32>, !tosa.shape<10>, tensor<f32>) -> tensor<4x12x12x2x1xf32> + // CHECK: %[[RESHAPE_ARG1:.+]] = tosa.reshape %arg1, %[[CONST3]] + // CHECK: %[[MUL:.+]] = tosa.mul %[[PAD_I]], %[[RESHAPE_ARG1]], %[[SHIFT]] + // CHECK: %[[RESHAPE_O:.+]] = tosa.reshape %[[MUL]], %[[CONST4]] + // CHECK: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[CONST5]] + // CHECK: %[[ADD:.+]] = tosa.add %[[RESHAPE_O]], %[[RESHAPE_ARG2]] %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x12x12x6xf32> return %0 : tensor<4x12x12x6xf32> } diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir index bd18b7ea0fdff..ae7a8e90b4281 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir +++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir @@ -120,8 +120,8 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1 // CHECK-DAG: %[[CONV_NEW_SHAPE:.*]] = tosa.const_shape {value = dense<[2, 18, 16, 2, 3, 5]> : tensor<6xindex>} // CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]], %[[CONV_NEW_SHAPE]] // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]] - // CHECK-DAG: %[[TEANS_NEW_SHAPE:.+]] = tosa.const_shape {value = dense<[2, 36, 48, 5]> : tensor<4xindex>} - // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]], %[[TEANS_NEW_SHAPE]] + // CHECK-DAG: %[[TRANS_NEW_SHAPE:.+]] = tosa.const_shape {value = dense<[2, 36, 48, 5]> : tensor<4xindex>} + // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]], %[[TRANS_NEW_SHAPE]] // CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]], %[[START]], %[[SIZE]] // CHECK-DAG: %[[ARG2_NEW_SHAPE:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 5]> : tensor<4xindex>} // CHECK-DAG: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[ARG2_NEW_SHAPE]] diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir index fa590ab495ada..7e714d0f8547a 100644 --- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir @@ -501,9 +501,9 @@ func.func @test_slice(%arg0 : tensor) -> () { // CHECK-LABEL: @test_slice_size_minus_one func.func @test_slice_size_minus_one(%arg0 : tensor) -> () { - // CHECK: %[[Start:.+]] = tosa.const_shape - // CHECK: %[[Size:.+]] = tosa.const_shape - // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[Start]], %[[Size]] : (tensor, !tosa.shape<4>, !tosa.shape<4>) -> tensor + // CHECK: %[[START:.+]] = tosa.const_shape + // CHECK: %[[SIZE:.+]] = tosa.const_shape + // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[START]], %[[SIZE]] : (tensor, !tosa.shape<4>, !tosa.shape<4>) -> tensor // this checks following // dim 0: size=-1, input dim=? => inferred output dim is ? 
// dim 1: size=-1 => inferred output dim is input_dim - start @@ -519,9 +519,9 @@ func.func @test_slice_size_minus_one(%arg0 : tensor) -> () { // CHECK-LABEL: @test_slice_size_out_of_bound func.func @test_slice_size_out_of_bound(%arg0 : tensor<8x8x8x?xi32>) -> () { - // CHECK: %[[Start:.+]] = tosa.const_shape - // CHECK: %[[Size:.+]] = tosa.const_shape - // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[Start]], %[[Size]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor + // CHECK: %[[START:.+]] = tosa.const_shape + // CHECK: %[[SIZE:.+]] = tosa.const_shape + // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[START]], %[[SIZE]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor // this checks following // dim 0: size=0 => inferred output dim is ? // dim 1: size=-2 => inferred output dim is ? @@ -537,9 +537,9 @@ func.func @test_slice_size_out_of_bound(%arg0 : tensor<8x8x8x?xi32>) -> () { // CHECK-LABEL: @test_slice_start_out_of_bound func.func @test_slice_start_out_of_bound(%arg0 : tensor<8x8x8x?xi32>) -> () { - // CHECK: %[[Start:.+]] = tosa.const_shape - // CHECK: %[[Size:.+]] = tosa.const_shape - // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[Start]], %[[Size]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor + // CHECK: %[[START:.+]] = tosa.const_shape + // CHECK: %[[SIZE:.+]] = tosa.const_shape + // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[START]], %[[SIZE]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor // this checks following // dim 0: start=-1 => inferred output dim is ? // dim 1: start=8 => inferred output dim is ? diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 249a0552c87f3..569b0def37856 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -120,6 +120,25 @@ define void @trig_test(float %0, <8 x float> %1) { ret void } +; CHECK-LABEL: llvm.func @inv_trig_test +define void @inv_trig_test(float %0, <8 x float> %1) { + ; CHECK: llvm.intr.asin(%{{.*}}) : (f32) -> f32 + %3 = call float @llvm.asin.f32(float %0) + ; CHECK: llvm.intr.asin(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %4 = call <8 x float> @llvm.asin.v8f32(<8 x float> %1) + + ; CHECK: llvm.intr.acos(%{{.*}}) : (f32) -> f32 + %5 = call float @llvm.acos.f32(float %0) + ; CHECK: llvm.intr.acos(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %6 = call <8 x float> @llvm.acos.v8f32(<8 x float> %1) + + ; CHECK: llvm.intr.atan(%{{.*}}) : (f32) -> f32 + %7 = call float @llvm.atan.f32(float %0) + ; CHECK: llvm.intr.atan(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %8 = call <8 x float> @llvm.atan.v8f32(<8 x float> %1) + + ret void +} ; CHECK-LABEL: llvm.func @hyperbolic_trig_test define void @hyperbolic_trig_test(float %0, <8 x float> %1) { ; CHECK: llvm.intr.sinh(%{{.*}}) : (f32) -> f32 diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 2c208789e36dd..3616a2e3c7b21 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -122,6 +122,26 @@ llvm.func @trig_test(%arg0: f32, %arg1: vector<8xf32>) { llvm.return } +// CHECK-LABEL: @inv_trig_test +llvm.func @inv_trig_test(%arg0: f32, %arg1: vector<8xf32>) { + // CHECK: call float @llvm.asin.f32 + llvm.intr.asin(%arg0) : (f32) -> f32 + // CHECK: call <8 x float> @llvm.asin.v8f32 + llvm.intr.asin(%arg1) : (vector<8xf32>) -> vector<8xf32> + + // CHECK: call float @llvm.acos.f32 + 
llvm.intr.acos(%arg0) : (f32) -> f32 + // CHECK: call <8 x float> @llvm.acos.v8f32 + llvm.intr.acos(%arg1) : (vector<8xf32>) -> vector<8xf32> + + // CHECK: call float @llvm.atan.f32 + llvm.intr.atan(%arg0) : (f32) -> f32 + // CHECK: call <8 x float> @llvm.atan.v8f32 + llvm.intr.atan(%arg1) : (vector<8xf32>) -> vector<8xf32> + + llvm.return +} + // CHECK-LABEL: @hyperbolic_trig_test llvm.func @hyperbolic_trig_test(%arg0: f32, %arg1: vector<8xf32>) { // CHECK: call float @llvm.sinh.f32 diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 94f8ea4faf4a8..307a88709ad52 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -466,3 +466,103 @@ def matmul_as_contract_op( ) print(module) + + +# CHECK-LABEL: TEST: testBatchMatmulOp +@run +def testBatchMatmulOp(): + with Context(), Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + a_shape = (2, 4, 8) + b_shape = (2, 8, 12) + b_transposed_shape = (2, 12, 8) + c_shape = (2, 4, 12) + + dimBatch = ir.AffineDimExpr.get(0) + dimM = ir.AffineDimExpr.get(1) + dimN = ir.AffineDimExpr.get(2) + dimK = ir.AffineDimExpr.get(3) + + # CHECK: #[[$A_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> + # CHECK: #[[$BTrans_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> + # CHECK: #[[$C_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + + a_map = ir.AffineMap.get(4, 0, [dimBatch, dimM, dimK]) + b_transposed_map = ir.AffineMap.get(4, 0, [dimBatch, dimN, dimK]) + c_map = ir.AffineMap.get(4, 0, [dimBatch, dimM, dimN]) + + # CHECK: func.func @batch_matmul_op( + @func.FuncOp.from_py_func( + # CHECK-SAME: %[[A:.*]]: tensor<2x4x8xf32>, + RankedTensorType.get(a_shape, f32), + # CHECK-SAME: %[[Amem:.*]]: memref<2x4x8xf32>, + MemRefType.get(a_shape, f32), + # CHECK-SAME: %[[B:.*]]: tensor<2x8x12xf32>, + RankedTensorType.get(b_shape, f32), + # CHECK-SAME: %[[Bmem:.*]]: memref<2x8x12xf32>, + MemRefType.get(b_shape, f32), + # CHECK-SAME: %[[BTrans:.*]]: tensor<2x12x8xf32>, + RankedTensorType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[BTransmem:.*]]: memref<2x12x8xf32>, + MemRefType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[C:.*]]: tensor<2x4x12xf32>, + RankedTensorType.get(c_shape, f32), + # CHECK-SAME: %[[Cmem:.*]]: memref<2x4x12xf32>) + MemRefType.get(c_shape, f32), + ) + def batch_matmul_op(A, Amem, B, Bmem, Btransposed, Btransposedmem, C, Cmem): + # CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x8x12xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=(C.type,), + inputs=(A, B), + outputs=(C,), + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x8x12xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.batch_matmul(A, B, outs=(C,)) + + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<2x4x8xf32>, tensor<2x12x8xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=(C.type,), + inputs=(A, Btransposed), + outputs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<2x4x8xf32>, tensor<2x12x8xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.batch_matmul( + A, + Btransposed, + 
outs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + + # CHECK: linalg.batch_matmul ins(%[[Amem]], %[[Bmem]] : memref<2x4x8xf32>, memref<2x8x12xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=[], + inputs=(Amem, Bmem), + outputs=(Cmem,), + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul ins(%[[Amem]], %[[Bmem]] : memref<2x4x8xf32>, memref<2x8x12xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + linalg.batch_matmul(Amem, Bmem, outs=(Cmem,)) + + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<2x4x8xf32>, memref<2x12x8xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=[], + inputs=(Amem, Btransposedmem), + outputs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<2x4x8xf32>, memref<2x12x8xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + linalg.batch_matmul( + Amem, + Btransposedmem, + outs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + + print(module) diff --git a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp index bbb6bd6617a13..87df9e19d1842 100644 --- a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp +++ b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp @@ -348,11 +348,11 @@ static mlir::RewriterRegistration rewriteMarkRanges("mark-ranges", "Indicate ranges parsed", markRanges); int main(int argc, char **argv) { - static llvm::cl::opt<std::string> inputFilename( - llvm::cl::Positional, llvm::cl::desc("<input file>"), - llvm::cl::init("-")); + llvm::cl::opt<std::string> inputFilename(llvm::cl::Positional, + llvm::cl::desc("<input file>"), + llvm::cl::init("-")); - static llvm::cl::opt<std::string> outputFilename( + llvm::cl::opt<std::string> outputFilename( "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), llvm::cl::init("-")); diff --git a/mlir/tools/mlir-runner/mlir-runner.cpp b/mlir/tools/mlir-runner/mlir-runner.cpp index 7e8793de03ead..932c9f6cc9fdc 100644 --- a/mlir/tools/mlir-runner/mlir-runner.cpp +++ b/mlir/tools/mlir-runner/mlir-runner.cpp @@ -32,7 +32,7 @@ using namespace mlir; // TODO: Consider removing this linking functionality from the SPIR-V CPU Runner // flow in favour of a more proper host/device split like other runners. // https://github.com/llvm/llvm-project/issues/115348 -llvm::cl::opt<bool> LinkNestedModules( +static llvm::cl::opt<bool> LinkNestedModules( "link-nested-modules", llvm::cl::desc("Link two nested MLIR modules into a single LLVM IR module. 
" "Useful if both the host and device code can be run on the " diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp index 414cad5e1dcc2..6cf71d2bb0174 100644 --- a/mlir/tools/mlir-tblgen/DialectGen.cpp +++ b/mlir/tools/mlir-tblgen/DialectGen.cpp @@ -34,7 +34,7 @@ using llvm::Record; using llvm::RecordKeeper; static llvm::cl::OptionCategory dialectGenCat("Options for -gen-dialect-*"); -llvm::cl::opt +static llvm::cl::opt selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"), llvm::cl::cat(dialectGenCat), llvm::cl::CommaSeparated); diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 43d406e4340f7..dbaad84cda5d6 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -44,11 +44,11 @@ using mlir::tblgen::Operator; //===----------------------------------------------------------------------===// static cl::OptionCategory docCat("Options for -gen-(attrdef|typedef|enum|op|dialect)-doc"); -cl::opt +static cl::opt stripPrefix("strip-prefix", cl::desc("Strip prefix of the fully qualified names"), cl::init("::mlir::"), cl::cat(docCat)); -cl::opt allowHugoSpecificFeatures( +static cl::opt allowHugoSpecificFeatures( "allow-hugo-specific-features", cl::desc("Allows using features specific to Hugo"), cl::init(false), cl::cat(docCat)); diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index c9f6dd35de44e..c2ad09ffaaed5 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -36,7 +36,7 @@ using namespace mlir; using tblgen::NamedTypeConstraint; static llvm::cl::OptionCategory dialectGenCat("Options for -gen-irdl-dialect"); -llvm::cl::opt +static llvm::cl::opt selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"), llvm::cl::cat(dialectGenCat), llvm::cl::Required); diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp index 976ff2e7382ed..570d56f3c6ff1 100644 --- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp +++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp @@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) { EXPECT_EQ(chipset->minorVersion, 0u); EXPECT_EQ(chipset->steppingVersion, 0xau); - chipset = Chipset::parse("gfx940"); + chipset = Chipset::parse("gfx942"); ASSERT_TRUE(succeeded(chipset)); EXPECT_EQ(chipset->majorVersion, 9u); EXPECT_EQ(chipset->minorVersion, 4u); - EXPECT_EQ(chipset->steppingVersion, 0u); + EXPECT_EQ(chipset->steppingVersion, 2u); chipset = Chipset::parse("gfx1103"); ASSERT_TRUE(succeeded(chipset)); @@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) { EXPECT_TRUE(failed(Chipset::parse("navi33"))); EXPECT_TRUE(failed(Chipset::parse("rdna2"))); EXPECT_TRUE(failed(Chipset::parse("sm_80"))); - EXPECT_TRUE(failed(Chipset::parse("GFX940"))); - EXPECT_TRUE(failed(Chipset::parse("Gfx940"))); + EXPECT_TRUE(failed(Chipset::parse("GFX942"))); + EXPECT_TRUE(failed(Chipset::parse("Gfx942"))); EXPECT_TRUE(failed(Chipset::parse("gfx9"))); - EXPECT_TRUE(failed(Chipset::parse("gfx_940"))); - EXPECT_TRUE(failed(Chipset::parse("gfx940_"))); + EXPECT_TRUE(failed(Chipset::parse("gfx_942"))); + EXPECT_TRUE(failed(Chipset::parse("gfx942_"))); EXPECT_TRUE(failed(Chipset::parse("gfxmeow"))); EXPECT_TRUE(failed(Chipset::parse("gfx1fff"))); } TEST(ChipsetTest, Comparison) { - EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0)); - EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2)); + 
EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2)); EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0)); EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0)); EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2)); - EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1)); EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2)); - EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0)); EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8)); EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa)); - EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2)); - EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0)); + EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2)); } } // namespace diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 92184ba796dbd..e83d38a14f77f 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2854,12 +2854,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Error checkIfAPU() { // TODO: replace with ROCr API once it becomes available. llvm::StringRef StrGfxName(ComputeUnitKind); - IsAPU = llvm::StringSwitch<bool>(StrGfxName) - .Case("gfx940", true) - .Default(false); - if (IsAPU) - return Plugin::success(); - bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName) .Case("gfx942", true) .Default(false); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 1e265d2c30904..f017bca85dd4f 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -134,12 +134,10 @@ elif config.libomptarget_current_target.startswith('amdgcn'): # amdgpu_test_arch contains a list of AMD GPUs in the system # only check the first one assuming that we will run the test on it. if not (config.amdgpu_test_arch.startswith("gfx90a") or - config.amdgpu_test_arch.startswith("gfx940") or config.amdgpu_test_arch.startswith("gfx942")): supports_unified_shared_memory = False # check if AMD architecture is an APU: - if (config.amdgpu_test_arch.startswith("gfx940") or - (config.amdgpu_test_arch.startswith("gfx942") and + if ((config.amdgpu_test_arch.startswith("gfx942") and evaluate_bool_env(config.environment['IS_APU']))): supports_apu = True if supports_unified_shared_memory: diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 04bf6c3b34dac..b002b29e13747 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -521,13 +521,6 @@ enum library_type { library_throughput }; -#if KMP_OS_LINUX -enum clock_function_type { - clock_function_gettimeofday, - clock_function_clock_gettime -}; -#endif /* KMP_OS_LINUX */ - #if KMP_MIC_SUPPORTED enum mic_type { non_mic, mic1, mic2, mic3, dummy }; #endif @@ -3415,7 +3408,6 @@ extern kmp_bootstrap_lock_t __kmp_threads expansion to co-exist */ extern kmp_lock_t __kmp_global_lock; /* control OS/global access */ -extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access */ extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */ extern enum library_type __kmp_library; @@ -3545,11 +3537,6 @@ extern int __kmp_hot_teams_mode; extern int __kmp_hot_teams_max_level; #endif -#if KMP_OS_LINUX -extern enum clock_function_type __kmp_clock_function; -extern int __kmp_clock_function_param; -#endif /* KMP_OS_LINUX */ - #if KMP_MIC_SUPPORTED extern enum mic_type __kmp_mic_type; #endif diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index 5017cd3de4be5..c6446bdb90f63 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -242,11 +242,6 @@ enum sched_type __kmp_sch_map[kmp_sched_upper - 
kmp_sched_lower_ext + // of public intel extension schedules }; -#if KMP_OS_LINUX -enum clock_function_type __kmp_clock_function; -int __kmp_clock_function_param; -#endif /* KMP_OS_LINUX */ - #if KMP_MIC_SUPPORTED enum mic_type __kmp_mic_type = non_mic; #endif @@ -493,8 +488,6 @@ KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock); KMP_ALIGN_CACHE_INTERNODE KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */ KMP_ALIGN_CACHE_INTERNODE -kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ -KMP_ALIGN_CACHE_INTERNODE KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */ #else KMP_ALIGN_CACHE @@ -513,8 +506,6 @@ KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock); KMP_ALIGN(128) KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */ KMP_ALIGN(128) -kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ -KMP_ALIGN(128) KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */ #endif diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 2c8d9304c46bc..c42c89e106690 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -7143,7 +7143,6 @@ static void __kmp_do_serial_initialize(void) { __kmp_stats_init(); #endif __kmp_init_lock(&__kmp_global_lock); - __kmp_init_queuing_lock(&__kmp_dispatch_lock); __kmp_init_lock(&__kmp_debug_lock); __kmp_init_atomic_lock(&__kmp_atomic_lock); __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);