From 2207e3e32549306bf563c6987f790cabe8d4ea78 Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Wed, 19 Feb 2025 09:06:51 +0800 Subject: [PATCH 001/220] [libc++] Set feature-test macro `__cpp_lib_atomic_float` (#127559) The corresponding feature was implemented in LLVM 18 (by #67799), but this FTM wasn't added before. --- libcxx/docs/FeatureTestMacroTable.rst | 2 +- libcxx/docs/Status/Cxx20Papers.csv | 2 +- libcxx/include/version | 2 +- .../atomic.version.compile.pass.cpp | 48 ++++++------------- .../version.version.compile.pass.cpp | 48 ++++++------------- .../generate_feature_test_macro_components.py | 1 - 6 files changed, 33 insertions(+), 70 deletions(-) diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index ccaa784ccb088..dcf9838edd74b 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -174,7 +174,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_atomic_flag_test`` ``201907L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_atomic_float`` *unimplemented* + ``__cpp_lib_atomic_float`` ``201711L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_atomic_lock_free_type_aliases`` ``201907L`` ---------------------------------------------------------- ----------------- diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index 360b5520260ce..1c060c32b6f57 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -2,7 +2,7 @@ "`P0463R1 `__","Endian just Endian","2017-07 (Toronto)","|Complete|","7","" "`P0674R1 `__","Extending make_shared to Support Arrays","2017-07 (Toronto)","|Complete|","15","" "","","","","","" -"`P0020R6 `__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18","" +"`P0020R6 `__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18","The feature-test macro was not set until LLVM 20." 
"`P0053R7 `__","C++ Synchronized Buffered Ostream","2017-11 (Albuquerque)","|Complete|","18","" "`P0202R3 `__","Add constexpr modifiers to functions in and Headers","2017-11 (Albuquerque)","|Complete|","12","" "`P0415R1 `__","Constexpr for ``std::complex``\ ","2017-11 (Albuquerque)","|Complete|","16","" diff --git a/libcxx/include/version b/libcxx/include/version index c5966b90c061d..63ead9fd5d29d 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -378,7 +378,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_array_constexpr 201811L # define __cpp_lib_assume_aligned 201811L # define __cpp_lib_atomic_flag_test 201907L -// # define __cpp_lib_atomic_float 201711L +# define __cpp_lib_atomic_float 201711L # define __cpp_lib_atomic_lock_free_type_aliases 201907L # define __cpp_lib_atomic_ref 201806L // # define __cpp_lib_atomic_shared_ptr 201711L diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp index 9ed18fbfe19ac..5a21e6320bffe 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp @@ -169,17 +169,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++20" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++20" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++20" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++20" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -262,17 +256,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++23" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++23" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++23" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -355,17 +343,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++26" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++26" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++26" # endif # ifndef __cpp_lib_atomic_is_always_lock_free diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 137d6cb428930..1e4465d515e6b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -3282,17 +3282,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++20" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++20" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++20" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++20" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -4707,17 +4701,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++23" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++23" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++23" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -6369,17 +6357,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++26" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++26" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++26" # endif # ifndef __cpp_lib_atomic_is_always_lock_free diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 25168b9087754..8bf7633e985d5 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -169,7 +169,6 @@ def add_version_header(tc): "name": "__cpp_lib_atomic_float", "values": {"c++20": 201711}, "headers": ["atomic"], - "unimplemented": True, }, { "name": "__cpp_lib_atomic_is_always_lock_free", From b2659ca44b2e26b558fce66689792709411c7d38 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 19 Feb 2025 09:22:15 +0800 Subject: [PATCH 002/220] [InstCombine] Propagate flags in `foldSelectICmpAndBinOp` (#127437) It is always safe to add poison-generating flags for `BinOp Y, Identity`. Proof: https://alive2.llvm.org/ce/z/8BLEpq and https://alive2.llvm.org/ce/z/584Bb4 Then we can propagate flags from one of the arms: ``` select Cond, Y, (BinOp flags Y, Z) -> select Cond, (BinOp flags Y, Identity), (BinOp flags Y, Z) -> BinOp flags Y, (select Cond, Identity, Z) ``` This patch is proposed to avoid information loss caused by https://github.com/llvm/llvm-project/pull/127390. --- .../InstCombine/InstCombineSelect.cpp | 5 ++- .../InstCombine/select-with-bitwise-ops.ll | 42 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 2e14145aef884..cf38fc5f058f2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -829,7 +829,10 @@ static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal, if (NeedXor) V = Builder.CreateXor(V, *C2); - return Builder.CreateBinOp(BinOp->getOpcode(), Y, V); + auto *Res = Builder.CreateBinOp(BinOp->getOpcode(), Y, V); + if (auto *BO = dyn_cast(Res)) + BO->copyIRFlags(BinOp); + return Res; } /// Canonicalize a set or clear of a masked set of constant bits to diff --git a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll index 7c100f579399d..67dec9178eeca 100644 --- a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll +++ b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll @@ -20,6 +20,34 @@ define i32 @select_icmp_eq_and_1_0_or_2(i32 %x, i32 %y) { ret i32 %select } +define i32 @select_icmp_eq_and_1_0_or_2_disjoint(i32 %x, i32 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_disjoint( +; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or disjoint i32 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 1 + %cmp = icmp eq i32 %and, 0 + %or = or disjoint i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + +define i32 @select_icmp_eq_and_1_0_add_2_nsw_nuw(i32 %x, i32 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_add_2_nsw_nuw( +; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = add nuw nsw i32 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 1 + %cmp 
= icmp eq i32 %and, 0 + %or = add nsw nuw i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + define <2 x i32> @select_icmp_eq_and_1_0_or_2_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_vec( ; CHECK-NEXT: [[AND:%.*]] = shl <2 x i32> [[X:%.*]], splat (i32 1) @@ -1696,6 +1724,20 @@ define i8 @select_icmp_eq_and_1_0_lshr_fv(i8 %x, i8 %y) { ret i8 %select } +define i8 @select_icmp_eq_and_1_0_lshr_exact_fv(i8 %x, i8 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_lshr_exact_fv( +; CHECK-NEXT: [[AND:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = lshr exact i8 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i8 [[SELECT]] +; + %and = and i8 %x, 1 + %cmp = icmp eq i8 %and, 0 + %blshr = lshr exact i8 %y, 2 + %select = select i1 %cmp, i8 %y, i8 %blshr + ret i8 %select +} + define i8 @select_icmp_eq_and_1_0_lshr_tv(i8 %x, i8 %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_lshr_tv( ; CHECK-NEXT: [[AND:%.*]] = shl i8 [[X:%.*]], 1 From da47a80d0398d4b5c680ab31a15f48293c731091 Mon Sep 17 00:00:00 2001 From: Ken Matsui <26405363+ken-matsui@users.noreply.github.com> Date: Tue, 18 Feb 2025 20:49:34 -0500 Subject: [PATCH 003/220] [PGO][test] Specify exact match for check labels (#117376) Some check labels in the basic.ll test are simply f[1-3], which matches all cases where the function entry is hot, cold, or uncategorized. Since the actual test results for each label have mixed cases, the current labels can be considered ambiguous. This patch specifies exact matches for each label to ensure more precise validation and to prevent potential regressions in the future. --- llvm/test/Analysis/ProfileSummary/basic.ll | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/Analysis/ProfileSummary/basic.ll b/llvm/test/Analysis/ProfileSummary/basic.ll index c4f48ccafde86..0385c3a921c01 100644 --- a/llvm/test/Analysis/ProfileSummary/basic.ll +++ b/llvm/test/Analysis/ProfileSummary/basic.ll @@ -7,9 +7,9 @@ define void @f1() !prof !20 { ; CHECK-LABEL: f1 :hot -; OVERRIDE-HOT-LABEL: f1 +; OVERRIDE-HOT-LABEL: f1{{$}} ; OVERRIDE-COLD-LABEL: f1 :hot -; OVERRIDE-BOTH-LABEL: f1 +; OVERRIDE-BOTH-LABEL: f1{{$}} ; HOT-CUTOFF-0-LABEL: f1{{$}} ; COLD-CUTOFF-0-LABEL: f1 :cold @@ -19,8 +19,8 @@ define void @f1() !prof !20 { define void @f2() !prof !21 { ; CHECK-LABEL: f2 :cold ; OVERRIDE-HOT-LABEL: f2 :cold -; OVERRIDE-COLD-LABEL: f2 -; OVERRIDE-BOTH-LABEL: f2 +; OVERRIDE-COLD-LABEL: f2{{$}} +; OVERRIDE-BOTH-LABEL: f2 :cold ; HOT-CUTOFF-0-LABEL: f2 :cold ; COLD-CUTOFF-0-LABEL: f2 :cold @@ -28,10 +28,10 @@ define void @f2() !prof !21 { } define void @f3() !prof !22 { -; CHECK-LABEL: f3 -; OVERRIDE-HOT-LABEL: f3 -; OVERRIDE-COLD-LABEL: f3 -; OVERRIDE-BOTH-LABEL: f3 +; CHECK-LABEL: f3 :hot +; OVERRIDE-HOT-LABEL: f3{{$}} +; OVERRIDE-COLD-LABEL: f3 :hot +; OVERRIDE-BOTH-LABEL: f3 :cold ; HOT-CUTOFF-0-LABEL: f3{{$}} ; COLD-CUTOFF-0-LABEL: f3 :cold From b100c5074bb761f1a2ca39c4e274aa2f7e724439 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Tue, 18 Feb 2025 17:57:31 -0800 Subject: [PATCH 004/220] [lld][WebAssembly] Fix warnings in test. 
NFC (#127714) --- lld/test/wasm/data-segments.ll | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll index 41868a0b2b50b..79f1d384919d9 100644 --- a/lld/test/wasm/data-segments.ll +++ b/lld/test/wasm/data-segments.ll @@ -6,36 +6,36 @@ ; RUN: llc --mtriple=wasm32-unknown-unknown -filetype=obj %s -o %t.atomics.bulk-mem.pic.o -relocation-model=pic -mattr=+atomics,+bulk-memory,+mutable-globals ; RUN: llc --mtriple=wasm64-unknown-unknown -filetype=obj %s -o %t.atomics.bulk-mem.pic-mem64.o -relocation-model=pic -mattr=+atomics,+bulk-memory,+mutable-globals -; atomics, shared memory => error +;; atomics, shared memory => error ; RUN: not wasm-ld -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.o -o %t.atomics.wasm 2>&1 | FileCheck %s --check-prefix ERROR -; bulk memory, unshared memory => active segments +;; bulk memory, unshared memory => active segments ; RUN: wasm-ld -no-gc-sections --no-entry %t.bulk-mem.o -o %t.bulk-mem.wasm ; RUN: obj2yaml %t.bulk-mem.wasm | FileCheck %s --check-prefixes ACTIVE,ACTIVE32 -; bulk memory, unshared memory, wasm64 => active segments +;; bulk memory, unshared memory, wasm64 => active segments ; RUN: wasm-ld -mwasm64 -no-gc-sections --no-entry %t.bulk-mem64.o -o %t.bulk-mem64.wasm ; RUN: obj2yaml %t.bulk-mem64.wasm | FileCheck %s --check-prefixes ACTIVE,ACTIVE64 -; atomics, bulk memory, shared memory => passive segments +;; atomics, bulk memory, shared memory => passive segments ; RUN: wasm-ld -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.o -o %t.atomics.bulk-mem.wasm ; RUN: obj2yaml %t.atomics.bulk-mem.wasm | FileCheck %s --check-prefix PASSIVE ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.atomics.bulk-mem.wasm | FileCheck %s --check-prefixes DIS,NOPIC-DIS -DPTR=i32 -; atomics, bulk memory, shared memory, wasm64 => passive segments +;; atomics, bulk memory, shared memory, wasm64 => passive segments ; RUN: wasm-ld -mwasm64 -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem64.o -o %t.atomics.bulk-mem64.wasm ; RUN: obj2yaml %t.atomics.bulk-mem64.wasm | FileCheck %s --check-prefix PASSIVE ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.atomics.bulk-mem64.wasm | FileCheck %s --check-prefixes DIS,NOPIC-DIS -DPTR=i64 -; Also test in combination with PIC/pie +;; Also test in combination with PIC/pie ; RUN: wasm-ld --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic.o -o %t.pic.wasm ; RUN: obj2yaml %t.pic.wasm | FileCheck %s --check-prefixes PASSIVE-PIC,PASSIVE32-PIC -; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i32 +; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i32 -; Also test in combination with PIC/pie + wasm64 +;; Also test in combination with PIC/pie + wasm64 ; RUN: wasm-ld -mwasm64 --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic-mem64.o -o %t.pic-mem64.wasm ; RUN: obj2yaml %t.pic-mem64.wasm | FileCheck %s --check-prefixes 
PASSIVE-PIC,PASSIVE64-PIC -; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic-mem64.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i64 +; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic-mem64.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i64 @a = hidden global [6 x i8] c"hello\00", align 1 @b = hidden global [8 x i8] c"goodbye\00", align 1 @@ -151,7 +151,7 @@ ; PASSIVE-PIC-NEXT: - Index: 2 ; PASSIVE-PIC-NEXT: Name: __wasm_init_memory -; no data relocations. +;; no data relocations. ; DIS-LABEL: <__wasm_call_ctors>: ; DIS-EMPTY: ; DIS-NEXT: end From f6d74af4d9cabb9a940656887c82aaba3ad1f922 Mon Sep 17 00:00:00 2001 From: Ming-Yi Lai Date: Wed, 19 Feb 2025 10:12:54 +0800 Subject: [PATCH 005/220] [clang][X86] Only define __CET__ macro for X86 targets (#127616) The `-fcf-protection` flag is now also used to enable CFI features for the RISC-V target, so it's not suitable to define `__CET__` solely based on the flag anymore. This patch moves the definition of the `__CET__` macro into X86 target hook, so only X86 targets with the `-fcf-protection` flag would enable the `__CET__` macro. See https://github.com/llvm/llvm-project/pull/109784 and https://github.com/llvm/llvm-project/pull/112477 for the adoption of `-fcf-protection` flag for RISC-V targets. --- clang/lib/Basic/Targets/X86.cpp | 4 ++++ clang/lib/Frontend/CompilerInvocation.cpp | 11 ----------- clang/test/Preprocessor/riscv-cf-protection-return.c | 2 ++ 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 84a05cec04e7f..e4d3ad04fe9de 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1109,6 +1109,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasFloat128) Builder.defineMacro("__SIZEOF_FLOAT128__", "16"); + + if (Opts.CFProtectionReturn || Opts.CFProtectionBranch) + Builder.defineMacro("__CET__", Twine{(Opts.CFProtectionReturn << 1) | + Opts.CFProtectionBranch}); } bool X86TargetInfo::isValidFeatureName(StringRef Name) const { diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index b9a5c0589ebc4..4eb743acf327f 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4776,17 +4776,6 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, } } - // Add the __CET__ macro if a CFProtection option is set. - if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { - StringRef Name = A->getValue(); - if (Name == "branch") - Opts.addMacroDef("__CET__=1"); - else if (Name == "return") - Opts.addMacroDef("__CET__=2"); - else if (Name == "full") - Opts.addMacroDef("__CET__=3"); - } - // Add macros from the command line. 
for (const auto *A : Args.filtered(OPT_D, OPT_U)) { if (A->getOption().matches(OPT_D)) diff --git a/clang/test/Preprocessor/riscv-cf-protection-return.c b/clang/test/Preprocessor/riscv-cf-protection-return.c index 3a93a88fa6839..a4cbaa1edf68c 100644 --- a/clang/test/Preprocessor/riscv-cf-protection-return.c +++ b/clang/test/Preprocessor/riscv-cf-protection-return.c @@ -40,5 +40,7 @@ // RUN: -menable-experimental-extensions -fcf-protection=full -E -dM %s -o - \ // RUN: | FileCheck --check-prefixes=SHSTK-MACRO %s +// SHSTK-MACRO-NOT: __CET__ // SHSTK-MACRO: __riscv_shadow_stack 1{{$}} +// SHSTK-MACRO-NOT: __CET__ // NO-MACRO-NOT: __riscv_shadow_stack From 715edd70fdbda213668b55405c27c63292516fba Mon Sep 17 00:00:00 2001 From: Chris B Date: Tue, 18 Feb 2025 20:13:23 -0600 Subject: [PATCH 006/220] [HLSL] Allow arrays to copy-initialize (#127557) This change allows array variables to copy-initialize from other arrays. It also corrects a small error in HLSL C-Style casting that did not error on casting to arrays if elementwise and splat conversions fail. Fixes #127551 --- clang/lib/Sema/SemaCast.cpp | 87 ++++++++++++------- clang/lib/Sema/SemaInit.cpp | 12 +++ clang/test/SemaHLSL/Language/AssignArray.hlsl | 34 ++++++++ .../Language/ElementwiseCast-errors.hlsl | 2 +- 4 files changed, 101 insertions(+), 34 deletions(-) create mode 100644 clang/test/SemaHLSL/Language/AssignArray.hlsl diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 8972957ded9f5..89e8082ee80e7 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -104,6 +104,7 @@ namespace { void CheckStaticCast(); void CheckDynamicCast(); void CheckCXXCStyleCast(bool FunctionalCast, bool ListInitialization); + bool CheckHLSLCStyleCast(CheckedConversionKind CCK); void CheckCStyleCast(); void CheckBuiltinBitCast(); void CheckAddrspaceCast(); @@ -2776,39 +2777,9 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle, CheckedConversionKind CCK = FunctionalStyle ? 
CheckedConversionKind::FunctionalCast : CheckedConversionKind::CStyleCast; - - QualType SrcTy = SrcExpr.get()->getType(); - // This case should not trigger on regular vector cast, vector truncation - if (Self.getLangOpts().HLSL && - Self.HLSL().CanPerformElementwiseCast(SrcExpr.get(), DestType)) { - if (SrcTy->isConstantArrayType()) - SrcExpr = Self.ImpCastExprToType( - SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy), - CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK); - Kind = CK_HLSLElementwiseCast; - return; - } - - // This case should not trigger on regular vector splat - // If the relative order of this and the HLSLElementWise cast checks - // are changed, it might change which cast handles what in a few cases - if (Self.getLangOpts().HLSL && - Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) { - const VectorType *VT = SrcTy->getAs(); - // change splat from vec1 case to splat from scalar - if (VT && VT->getNumElements() == 1) - SrcExpr = Self.ImpCastExprToType( - SrcExpr.get(), VT->getElementType(), CK_HLSLVectorTruncation, - SrcExpr.get()->getValueKind(), nullptr, CCK); - // Inserting a scalar cast here allows for a simplified codegen in - // the case the destTy is a vector - if (const VectorType *DVT = DestType->getAs()) - SrcExpr = Self.ImpCastExprToType( - SrcExpr.get(), DVT->getElementType(), - Self.PrepareScalarCast(SrcExpr, DVT->getElementType()), - SrcExpr.get()->getValueKind(), nullptr, CCK); - Kind = CK_HLSLAggregateSplatCast; - return; + if (Self.getLangOpts().HLSL) { + if (CheckHLSLCStyleCast(CCK)) + return; } if (ValueKind == VK_PRValue && !DestType->isRecordType() && @@ -2927,6 +2898,56 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle, } } +// CheckHLSLCStyleCast - Returns `true` ihe cast is handled or errored as an +// HLSL-specific cast. Returns false if the cast should be checked as a CXX +// C-Style cast. +bool CastOperation::CheckHLSLCStyleCast(CheckedConversionKind CCK) { + assert(Self.getLangOpts().HLSL && "Must be HLSL!"); + QualType SrcTy = SrcExpr.get()->getType(); + // HLSL has several unique forms of C-style casts which support aggregate to + // aggregate casting. 
+ // This case should not trigger on regular vector cast, vector truncation + if (Self.HLSL().CanPerformElementwiseCast(SrcExpr.get(), DestType)) { + if (SrcTy->isConstantArrayType()) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy), + CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK); + Kind = CK_HLSLElementwiseCast; + return true; + } + + // This case should not trigger on regular vector splat + // If the relative order of this and the HLSLElementWise cast checks + // are changed, it might change which cast handles what in a few cases + if (Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) { + const VectorType *VT = SrcTy->getAs(); + // change splat from vec1 case to splat from scalar + if (VT && VT->getNumElements() == 1) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), VT->getElementType(), CK_HLSLVectorTruncation, + SrcExpr.get()->getValueKind(), nullptr, CCK); + // Inserting a scalar cast here allows for a simplified codegen in + // the case the destTy is a vector + if (const VectorType *DVT = DestType->getAs()) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), DVT->getElementType(), + Self.PrepareScalarCast(SrcExpr, DVT->getElementType()), + SrcExpr.get()->getValueKind(), nullptr, CCK); + Kind = CK_HLSLAggregateSplatCast; + return true; + } + + // If the destination is an array, we've exhausted the valid HLSL casts, so we + // should emit a dignostic and stop processing. + if (DestType->isArrayType()) { + Self.Diag(OpRange.getBegin(), diag::err_bad_cxx_cast_generic) + << 4 << SrcTy << DestType; + SrcExpr = ExprError(); + return true; + } + return false; +} + /// DiagnoseBadFunctionCast - Warn whenever a function call is cast to a /// non-matching type. Such as enum function call to int, int call to /// pointer; etc. Cast to 'void' is an exception. diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 6a76e6d74a4b0..a34005bf376aa 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6585,6 +6585,18 @@ void InitializationSequence::InitializeFrom(Sema &S, } } + if (S.getLangOpts().HLSL && Initializer && isa(DestAT)) { + QualType SrcType = Entity.getType(); + if (SrcType->isArrayParameterType()) + SrcType = + cast(SrcType)->getConstantArrayType(Context); + if (S.Context.hasSameUnqualifiedType(DestType, SrcType)) { + TryArrayCopy(S, Kind, Entity, Initializer, DestType, *this, + TreatUnavailableAsInvalid); + return; + } + } + // Some kinds of initialization permit an array to be initialized from // another array of the same type, and perform elementwise initialization. 
if (Initializer && isa(DestAT) && diff --git a/clang/test/SemaHLSL/Language/AssignArray.hlsl b/clang/test/SemaHLSL/Language/AssignArray.hlsl new file mode 100644 index 0000000000000..1f813e7a350b1 --- /dev/null +++ b/clang/test/SemaHLSL/Language/AssignArray.hlsl @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -ast-dump | FileCheck %s + +typedef vector int8[2]; + +export void fn(int8 A) { + int8 a = {A}; +// CHECK-LABEL: VarDecl {{.*}} b 'int8':'vector[2]' cinit +// CHECK-NEXT: ArrayInitLoopExpr {{.*}} 'int8':'vector[2]' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'vector' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' + int8 b = a; + +// CHECK-LABEL: VarDecl {{.*}} c 'int8':'vector[2]' cinit +// CHECK-NEXT: ArrayInitLoopExpr {{.*}} 'int8':'vector[2]' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' +// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'vector' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' + int8 c = A; +} + + + + diff --git a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl index 9417249383469..30591507b3260 100644 --- a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl +++ b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl @@ -4,7 +4,7 @@ export void cantCast() { int A[3] = {1,2,3}; int B[4] = {1,2,3,4}; B = (int[4])A; - // expected-error@-1 {{C-style cast from 'int *' to 'int[4]' is not allowed}} + // expected-error@-1 {{C-style cast from 'int[3]' to 'int[4]' is not allowed}} } struct S { From 6662fe393cab2c4e550002c276813a89d9ab4443 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= Date: Wed, 19 Feb 2025 04:04:10 +0100 Subject: [PATCH 007/220] [X86] Add missing vNbf16 handling in X86CallingConv.td file (#127102) Lack of these entries caused clang to crash on the following code: ```c __m256bh fun(__m256bh arg) { return arg; } __m256bh run() { __m256bh arg= {0}; fun(arg); } ``` It caused the FastISel to fail since it handled the call lowering basing on the X86CallingConv table. Curiously, if FastISel fails somewhere down the line and selectionDAGISel fallbacks, the crash does not occur. Following code _does not_ crash: ```c __m256bh fun(__m256bh arg) { return arg; } __m256bh run() { __m256bh arg= {0}; return fun(arg); } ``` This is puzzling to me. Obviously, if FastISel fails then compiler fallbacks to something else to lower these calls -- but since the X86callingConv table _doesn't_ have entries for vNbf16 how does this other thing manage not to crash? It has to use some other mechanism, one which doesn't use the table. This rises following questions: - how is this lowering accomplished without, presumably, using the CallingConv entries? - why is the table not used? 
I mean this points to some logic duplication (fastISel way vs. the other bug-free way) - How to properly test this? There is a test for vNbf16 values, but it also must not be using the FastISel path? This duplication of logic makes it hard to test this, since we don't have direct control whether the FastISel path or the other one is used. Nonetheless, this PR fixes the crash, though I didn't create a test for it, since I am unsure yet how it should look like. I would like to learn how the working non-FastISel mechanism works; I tried looking for it, but didn't yet manage to find anything --- llvm/lib/Target/X86/X86CallingConv.td | 48 +- llvm/test/CodeGen/X86/bfloat-calling-conv.ll | 1162 ++++++++++++++++++ 2 files changed, 1186 insertions(+), 24 deletions(-) create mode 100644 llvm/test/CodeGen/X86/bfloat-calling-conv.ll diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 72b103b0bb0c5..cf164acba9ec0 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -267,19 +267,19 @@ def RetCC_X86Common : CallingConv<[ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX target feature. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // Long double types are always returned in FP0 (even with SSE), @@ -565,7 +565,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -574,13 +574,13 @@ def CC_X86_64_C : CallingConv<[ // FIXME: This isn't precisely correct; the x86-64 ABI document says that // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. - CCIfNotVarArg>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>>, @@ -593,14 +593,14 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. 
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -631,13 +631,13 @@ def CC_X86_Win64_C : CallingConv<[ CCIfCFGuardTarget>, // 128 bit vectors are passed by pointer - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCPassIndirect>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCPassIndirect>, // 256 bit vectors are passed by pointer - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCPassIndirect>, + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCPassIndirect>, // 512 bit vectors are passed by pointer - CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64], CCPassIndirect>, + CCIfType<[v64i8, v32i16, v16i32, v32f16, v32bf16, v16f32, v8f64, v8i64], CCPassIndirect>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, @@ -734,15 +734,15 @@ def CC_X86_64_AnyReg : CallingConv<[ /// values are spilled on the stack. def CC_X86_32_Vector_Common : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -750,15 +750,15 @@ def CC_X86_32_Vector_Common : CallingConv<[ /// values are spilled on the stack. def CC_X86_Win32_Vector : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 4-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 4>>, // 256-bit AVX vectors get 32-byte stack slots that are 4-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 4>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 4-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 4>> ]>; @@ -766,16 +766,16 @@ def CC_X86_Win32_Vector : CallingConv<[ // vector registers def CC_X86_32_Vector_Standard : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCIfIsVarArgOnWin>, @@ -786,16 +786,16 @@ def CC_X86_32_Vector_Standard : CallingConv<[ // vector registers. def CC_X86_32_Vector_Darwin : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. 
- CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll new file mode 100644 index 0000000000000..ea4d32bae9ccb --- /dev/null +++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll @@ -0,0 +1,1162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=SSE2 %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=FAST_ISEL_SSE2 %s +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=AVX512BF16 %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=FAST_ISEL_AVX512BF16 %s +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=AVXNECONVERT %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=FAST_ISEL_AVXNECONVERT %s + +define bfloat @return_arg_bf16(bfloat %x) #0 { +; SSE2-LABEL: return_arg_bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %rax +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: popq %rax +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $0, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret bfloat %x +} + +define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v2bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v2bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: addq $40, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v2bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v2bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v2bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v2bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <2 x bfloat> %x +} + +define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v3bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v3bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: addq $40, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v3bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v3bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $2, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $1, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm1, %eax +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v3bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: 
return_arg_v3bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $2, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $1, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm1, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm1, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <3 x bfloat> %x +} + +define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v4bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v4bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; FAST_ISEL_SSE2-NEXT: addq $56, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v4bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v4bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v4bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v4bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <4 x bfloat> %x +} + +define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v8bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v8bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: 
movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FAST_ISEL_SSE2-NEXT: addq $56, %rsp +; FAST_ISEL_SSE2-NEXT: popq %rbx +; FAST_ISEL_SSE2-NEXT: popq %r14 +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v8bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v8bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v8bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v8bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <8 x bfloat> %x +} + +define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 { +; +; SSE2-LABEL: return_arg_v16bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v16bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $104, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; 
FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: addq $104, %rsp +; FAST_ISEL_SSE2-NEXT: popq %rbx +; FAST_ISEL_SSE2-NEXT: popq %r14 +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v16bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v16bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v16bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v16bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <16 x bfloat> %x +} + +declare bfloat @returns_bf16(bfloat) +declare <2 x bfloat> @returns_v2bf16(<2 x bfloat>) +declare <3 x bfloat> @returns_v3bf16(<3 x bfloat>) +declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>) +declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>) +declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>) + +define bfloat @call_ret_bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE2-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %rax +; FAST_ISEL_SSE2-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: callq returns_bf16@PLT +; +; AVX512BF16-LABEL: call_ret_bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX512BF16-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVXNECONVERT-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_bf16@PLT + %val = load bfloat, ptr %ptr + call bfloat @returns_bf16(bfloat %val) + unreachable +} + +define <2 x bfloat> @call_ret_v2bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v2bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v2bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: movl (%rdi), %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, (%rsp) +; FAST_ISEL_SSE2-NEXT: movdqa (%rsp), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; 
FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq returns_v2bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v2bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512BF16-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v2bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v2bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v2bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVXNECONVERT-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v2bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v2bf16@PLT + %val = load <2 x bfloat>, ptr %ptr + call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val) + unreachable +} + +define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v3bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movl 4(%rdi), %eax +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v3bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax +; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx +; FAST_ISEL_SSE2-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx +; FAST_ISEL_SSE2-NEXT: shll $16, %ecx +; FAST_ISEL_SSE2-NEXT: movd %ecx, %xmm0 +; FAST_ISEL_SSE2-NEXT: shrq $32, %rax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0 +; 
FAST_ISEL_SSE2-NEXT: callq returns_v3bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v3bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v3bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: movq (%rdi), %rax +; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx +; FAST_ISEL_AVX512BF16-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %ecx +; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: shrq $32, %rax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm2, %eax +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v3bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v3bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: movl 4(%rdi), %eax +; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v3bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movq (%rdi), %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm2, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm0, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v3bf16@PLT + %val = load <3 x bfloat>, ptr %ptr + call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val) + unreachable +} + +define <4 x bfloat> @call_ret_v4bf16(ptr 
%ptr) #0 { +; +; SSE2-LABEL: call_ret_v4bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v4bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; FAST_ISEL_SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; FAST_ISEL_SSE2-NEXT: callq returns_v4bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v4bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v4bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v4bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v4bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVXNECONVERT-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v4bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v4bf16@PLT + %val = load <4 x bfloat>, ptr %ptr + call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val) + unreachable +} + +define <8 x bfloat> @call_ret_v8bf16(ptr %ptr) #0 { +; +; 
SSE2-LABEL: call_ret_v8bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: callq returns_v8bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v8bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1 +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 
= mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FAST_ISEL_SSE2-NEXT: callq returns_v8bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v8bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BF16-NEXT: callq returns_v8bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v8bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v8bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v8bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0 +; AVXNECONVERT-NEXT: callq returns_v8bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v8bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v8bf16@PLT + %val = load <8 x bfloat>, ptr %ptr + call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val) + unreachable +} + +define <16 x bfloat> @call_ret_v16bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v16bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: callq returns_v16bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v16bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $104, %rsp +; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1 +; FAST_ISEL_SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw 
$0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: callq returns_v16bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v16bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BF16-NEXT: callq 
returns_v16bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v16bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %ymm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v16bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v16bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0 +; AVXNECONVERT-NEXT: callq returns_v16bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v16bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0 +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v16bf16@PLT + %val = load <16 x bfloat>, ptr %ptr + call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val) + unreachable +} + +attributes #0 = { nounwind } From aed9f11965d44e86fa5a53c7a6c1dfc7d8cbe6b8 Mon Sep 17 00:00:00 2001 From: Chaitanya Date: Wed, 19 Feb 2025 08:50:23 +0530 Subject: [PATCH 008/220] [AMDGPU] Handle lowering addrspace casts from LDS to FLAT address in amdgpu-sw-lower-lds. (#121214) "infer-address-spaces" pass replaces all refinable generic pointers with equivalent specific pointers. At -O0 optimisation level, infer-address-spaces pass doesn't run in the pipeline. "amdgpu-sw-lower-lds" pass instruments memory operations on addrspace(3) ptrs. Since, extra addrspacecasts are present from lds to flat addrspaces at -O0 and the actual store/load memory instructions are now on flat addrspace, these addrspacecast need to be handled in the amdgpu-sw-lower-lds pass itself. This patch lowers the lds ptr first to the corresponding ptr in the global memory from the asan_malloc. Then replaces the original cast with addrspacecast from global ptr to flat ptr. --- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 47 ++- ...gpu-sw-lower-lds-non-kernel-declaration.ll | 8 +- ...w-lower-lds-static-indirect-access-asan.ll | 98 ++--- ...-lds-static-indirect-access-nested-asan.ll | 334 +++++++++--------- ...lower-lds-static-indirect-access-nested.ll | 134 +++---- ...static-indirect-access-no-kernel-lds-id.ll | 8 +- ...gpu-sw-lower-lds-static-indirect-access.ll | 39 +- .../amdgpu-sw-lower-lds-static-lds-O0.ll | 76 ++++ ...gpu-sw-lower-lds-static-lds-vector-ptrs.ll | 95 +++++ 9 files changed, 539 insertions(+), 300 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index c0581e491720d..3159b497a1ecb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -192,8 +192,7 @@ class AMDGPUSwLowerLDS { void getLDSMemoryInstructions(Function *Func, SetVector &LDSInstructions); void replaceKernelLDSAccesses(Function *Func); - Value *getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, - Value *LDSPtr); + Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr); void translateLDSMemoryOperationsToGlobalMemory( Function *Func, Value *LoadMallocPtr, SetVector &LDSInstructions); @@ -655,20 +654,30 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions( } else if (AtomicCmpXchgInst *XCHG = dyn_cast(&Inst)) { if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) LDSInstructions.insert(&Inst); + } else if (AddrSpaceCastInst *ASC = dyn_cast(&Inst)) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + 
ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) + LDSInstructions.insert(&Inst); } else continue; } } } -Value * -AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, +Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr) { assert(LDSPtr && "Invalid LDS pointer operand"); - Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty()); - Value *GEP = - IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt}); - return GEP; + Type *LDSPtrType = LDSPtr->getType(); + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + if (auto *VecPtrTy = dyn_cast(LDSPtrType)) { + // Handle vector of pointers + ElementCount NumElements = VecPtrTy->getElementCount(); + IntTy = VectorType::get(IntTy, NumElements); + } + Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy); + return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex}); } void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( @@ -681,7 +690,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( if (LoadInst *LI = dyn_cast(Inst)) { Value *LIOperand = LI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand); + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand); LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement, LI->getAlign(), LI->isVolatile()); NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); @@ -691,7 +700,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( } else if (StoreInst *SI = dyn_cast(Inst)) { Value *SIOperand = SI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand); + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand); StoreInst *NewSI = IRB.CreateAlignedStore( SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); @@ -701,8 +710,8 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( } else if (AtomicRMWInst *RMW = dyn_cast(Inst)) { Value *RMWPtrOperand = RMW->getPointerOperand(); Value *RMWValOperand = RMW->getValOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, RMWPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand); AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW( RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(), RMW->getOrdering(), RMW->getSyncScopeID()); @@ -712,8 +721,8 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( RMW->eraseFromParent(); } else if (AtomicCmpXchgInst *XCHG = dyn_cast(Inst)) { Value *XCHGPtrOperand = XCHG->getPointerOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, XCHGPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand); AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg( Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(), XCHG->getAlign(), XCHG->getSuccessOrdering(), @@ -722,6 +731,16 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( AsanInfo.Instructions.insert(NewXCHG); XCHG->replaceAllUsesWith(NewXCHG); XCHG->eraseFromParent(); + } else if (AddrSpaceCastInst *ASC = dyn_cast(Inst)) { + Value *AIOperand = ASC->getPointerOperand(); + Value *Replacement = + 
getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand); + Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType()); + // Note: No need to add the instruction to AsanInfo instructions to be + // instrumented list. FLAT_ADDRESS ptr would have been already + // instrumented by asan pass prior to this pass. + ASC->replaceAllUsesWith(NewAI); + ASC->eraseFromParent(); } else report_fatal_error("Unimplemented LDS lowering instruction"); } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll index ae2bcbbb81b5f..a6e6b84bba304 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll @@ -20,8 +20,12 @@ define void @non_kernel_function() sanitize_address { ; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] -; CHECK-NEXT: [[Y:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(1) [[TMP13]] to ptr ; CHECK-NEXT: store i8 5, ptr [[TMP9]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll index 3a05f93df35a3..b9b4c90daea87 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. 
@@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP34:%.*]] = addrspacecast ptr addrspace(1) [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP36]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -45,16 +49,16 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP25:%.*]] = and i1 [[TMP21]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP25]]) ; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 0 -; CHECK-NEXT: br i1 [[TMP27]], label [[ASAN_REPORT:%.*]], label [[TMP30:%.*]], !prof [[PROF2:![0-9]+]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP25]], label [[TMP28:%.*]], label [[TMP29:%.*]] -; CHECK: 28: +; CHECK-NEXT: br i1 [[TMP27]], label %[[ASAN_REPORT:.*]], label %[[BB35:.*]], !prof [[PROF2:![0-9]+]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP25]], label %[[BB33:.*]], label %[[BB34:.*]] +; CHECK: [[BB33]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7:[0-9]+]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP29]] -; CHECK: 29: -; CHECK-NEXT: br label [[TMP30]] -; CHECK: 30: +; CHECK-NEXT: br label %[[BB34]] +; CHECK: [[BB34]]: +; CHECK-NEXT: br label %[[BB35]] +; CHECK: [[BB35]]: ; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8 ; CHECK-NEXT: ret void ; @@ -67,15 +71,15 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; 
CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -100,9 +104,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -124,16 +128,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]]) ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0 -; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT:%.*]], label [[TMP46:%.*]], !prof [[PROF2]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[CONDFREE:%.*]] -; CHECK: 44: +; CHECK-NEXT: br i1 [[TMP43]], label %[[ASAN_REPORT:.*]], label %[[BB46:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP41]], label %[[BB44:.*]], label %[[BB45:.*]] +; CHECK: [[BB44]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[CONDFREE]] -; CHECK: 45: -; CHECK-NEXT: br label [[TMP46]] -; CHECK: 46: +; CHECK-NEXT: br label %[[BB45]] +; CHECK: [[BB45]]: +; CHECK-NEXT: br label %[[BB46]] +; CHECK: [[BB46]]: ; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1 ; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]] @@ -152,16 +156,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP59:%.*]] = and i1 [[TMP54]], [[TMP58]] ; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP59]]) ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP60]], 0 -; CHECK-NEXT: br i1 [[TMP61]], label [[ASAN_REPORT1:%.*]], label [[TMP64:%.*]], !prof [[PROF2]] -; CHECK: asan.report1: -; CHECK-NEXT: br i1 [[TMP59]], label [[TMP62:%.*]], label [[TMP63:%.*]] -; CHECK: 64: +; CHECK-NEXT: br i1 [[TMP61]], label %[[ASAN_REPORT1:.*]], label %[[BB66:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT1]]: +; CHECK-NEXT: br i1 [[TMP59]], label %[[BB64:.*]], label %[[BB65:.*]] +; CHECK: [[BB64]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP83]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP63]] -; CHECK: 65: -; CHECK-NEXT: br label [[TMP64]] -; CHECK: 66: +; CHECK-NEXT: br label %[[BB65]] +; CHECK: [[BB65]]: +; CHECK-NEXT: br label %[[BB66]] +; CHECK: [[BB66]]: ; CHECK-NEXT: [[TMP84:%.*]] = ptrtoint ptr addrspace(1) [[TMP82]] to i64 ; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP84]], 3 ; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP85]], 2147450880 @@ -174,28 +178,28 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP76:%.*]] = and i1 [[TMP72]], [[TMP75]] ; 
CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP76]]) ; CHECK-NEXT: [[TMP78:%.*]] = icmp ne i64 [[TMP77]], 0 -; CHECK-NEXT: br i1 [[TMP78]], label [[ASAN_REPORT2:%.*]], label [[TMP81:%.*]], !prof [[PROF2]] -; CHECK: asan.report2: -; CHECK-NEXT: br i1 [[TMP76]], label [[TMP79:%.*]], label [[TMP80:%.*]] -; CHECK: 79: +; CHECK-NEXT: br i1 [[TMP78]], label %[[ASAN_REPORT2:.*]], label %[[BB81:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT2]]: +; CHECK-NEXT: br i1 [[TMP76]], label %[[BB79:.*]], label %[[BB80:.*]] +; CHECK: [[BB79]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP84]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP80]] -; CHECK: 80: -; CHECK-NEXT: br label [[TMP81]] -; CHECK: 81: +; CHECK-NEXT: br label %[[BB80]] +; CHECK: [[BB80]]: +; CHECK-NEXT: br label %[[BB81]] +; CHECK: [[BB81]]: ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll index 1dd391ec6321a..255dda562c1ea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel. @@ -6,50 +6,64 @@ @A = external addrspace(3) global [8 x ptr] @B = external addrspace(3) global [0 x i32] +;. 
+; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]] +; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]] +; @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address +;. 
define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; 
CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,56 +72,56 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) 
@llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: 
[[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,48 +130,48 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: 
br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,56 +180,56 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: 
[[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = 
call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -237,14 +251,16 @@ define private void @store_A() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; 
CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: store ptr [[TMP11]], ptr null, align 8 ; CHECK-NEXT: ret void ; store ptr addrspacecast (ptr addrspace(3) @A to ptr), ptr null @@ -256,14 +272,16 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: ret ptr [[TMP10]] +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: ret ptr [[TMP11]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) } @@ -272,8 +290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll index ed9107764eb91..7184ebbb8faa3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel. @@ -6,18 +6,32 @@ @A = external addrspace(3) global [8 x ptr] @B = external addrspace(3) global [0 x i32] +;. +; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]] +; @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]] +; @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]] +; @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]] +; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, 
i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address +;. define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -33,23 +47,23 @@ define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,16 +72,16 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] 
!llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -90,24 +104,24 @@ define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,16 +130,16 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label 
[[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -141,23 +155,23 @@ define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,16 +180,16 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -198,24 +212,24 @@ define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-NEXT: 
[[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -243,7 +257,9 @@ define private void @store_A() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 ; CHECK-NEXT: ret void ; @@ -262,7 +278,9 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: ret ptr [[TMP10]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) @@ -272,8 +290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll index b9fa89dd6f0a6..704bc9e635294 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll @@ -29,8 +29,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP9]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) [[TMP10]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP11]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(1) [[TMP19]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP12]] to i32 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll index 11e912287c7f7..8f5abe962f8eb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. 
@@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = addrspacecast ptr addrspace(1) [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -44,16 +48,16 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -78,9 +82,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -94,17 +98,17 @@ define amdgpu_kernel void @k0() sanitize_address { 
; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]] ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() @@ -124,5 +128,6 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} -; CHECK: [[META1]] = !{i32 0} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +; CHECK: [[META2]] = !{i32 0} ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll new file mode 100644 index 0000000000000..1973a0acf4659 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s +@lds = internal addrspace(3) global [5 x i32] poison, align 16 + +;. +; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 16, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 20, i32 64 } }, no_sanitize_address +;. 
+define amdgpu_kernel void @k0() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 52 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 44) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(3) [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = addrspacecast ptr addrspace(1) [[TMP23]] to ptr +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [5 x i32], ptr [[TMP24]], i64 0, i64 0 +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + %gep = 
getelementptr inbounds [5 x i32], ptr addrspacecast (ptr addrspace(3) @lds to ptr), i64 0, i64 0 + store i32 1, ptr %gep, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll new file mode 100644 index 0000000000000..34caf91def933 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if vector of static LDS ptrs accesses in kernel are lowered correctly. +@lds_var1 = internal addrspace(3) global i32 poison +@lds_var2 = internal addrspace(3) global i32 poison + +;. +; CHECK: @llvm.amdgcn.sw.lds.example = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.example.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.example.md.type { %llvm.amdgcn.sw.lds.example.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address +;. 
+define amdgpu_kernel void @example() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @example( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[ENTRY:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 28) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: br label %[[ENTRY]] +; CHECK: [[ENTRY]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP23]] +; CHECK-NEXT: [[VEC_LDS_PTRS:%.*]] = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) [[TMP22]], i32 0 +; CHECK-NEXT: [[VEC_LDS_PTRS1:%.*]] = insertelement <2 x ptr addrspace(3)> [[VEC_LDS_PTRS]], ptr addrspace(3) [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VEC_LDS_PTRS1]] to <2 x i32> +; CHECK-NEXT: 
[[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP32:%.*]] = addrspacecast <2 x ptr addrspace(1)> [[TMP31]] to <2 x ptr> +; CHECK-NEXT: [[ELEM0:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 0 +; CHECK-NEXT: store i32 42, ptr [[ELEM0]], align 4 +; CHECK-NEXT: [[ELEM1:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 1 +; CHECK-NEXT: store i32 43, ptr [[ELEM1]], align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP33:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr [[TMP33]] to i64 +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP35]], i64 [[TMP34]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; +entry: + ; Create a vector of flat pointers + %vec_lds_ptrs = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) @lds_var1, i32 0 + %vec_lds_ptrs1 = insertelement <2 x ptr addrspace(3)> %vec_lds_ptrs, ptr addrspace(3) @lds_var2, i32 1 + %vec_flat_ptrs = addrspacecast <2 x ptr addrspace(3)> %vec_lds_ptrs1 to <2 x ptr> + %elem0 = extractelement <2 x ptr> %vec_flat_ptrs, i32 0 + store i32 42, ptr %elem0, align 4 + %elem1 = extractelement <2 x ptr> %vec_flat_ptrs, i32 1 + store i32 43, ptr %elem1, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. From 22d65d898961e96f0a8340e090ffa34558279eab Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 19 Feb 2025 10:53:03 +0700 Subject: [PATCH 009/220] AMDGPU: Teach isOperandLegal about SALU literal restrictions (#127626) isOperandLegal mostly implemented the VALU operand rules, and largely ignored SALU restrictions. This theoretically avoids folding literals into SALU insts which already have a literal operand. This issue is currently avoided due to a bug in SIFoldOperands; this change will allow using raw operand legality rules. This breaks the formation of s_fmaak_f32 in SIFoldOperands, but it probably should not have been forming there in the first place. TwoAddressInsts or RA should generally handle that, and this only worked by accident. 
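A hedged sketch of the SALU encoding rule the new check models (the instruction and operand values here are illustrative, not taken from the patch): a scalar ALU instruction has a single trailing 32-bit literal slot, so a fold is only legal if it does not require a second distinct literal, while repeating the same literal is fine.

```
s_add_i32 s0, 0x12345, s1        ; one literal operand: encodable
s_add_i32 s0, 0x12345, 0x12345   ; same literal repeated, still one literal slot: encodable
s_add_i32 s0, 0x12345, 0x67890   ; two distinct literals: not encodable, so such a fold is rejected
```

The new fold-sgpr-multi-imm.mir tests below exercise the same rule at the MIR level.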
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 28 ++- .../AMDGPU/fold-operands-scalar-fmac.mir | 9 +- .../CodeGen/AMDGPU/fold-sgpr-multi-imm.mir | 199 ++++++++++++++++++ 3 files changed, 226 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ceab6c9dcca34..7dace11d208a0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5931,11 +5931,15 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); - const MachineOperand *UsedLiteral = nullptr; + const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo); - int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); - int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; - if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { + if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) { + const MachineOperand *UsedLiteral = nullptr; + + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); + int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; + + // TODO: Be more permissive with frame indexes. if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) { if (!LiteralLimit--) return false; @@ -5974,9 +5978,19 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return false; } } - } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() && - isF16PseudoScalarTrans(MI.getOpcode()) && - isInlineConstant(*MO, OpInfo)) { + } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) { + // There can be at most one literal operand, but it can be repeated. + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + const MachineOperand &Op = MI.getOperand(i); + if (!Op.isReg() && !Op.isFI() && + !isInlineConstant(Op, InstDesc.operands()[i]) && + !Op.isIdenticalTo(*MO)) + return false; + } + } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() && + isF16PseudoScalarTrans(MI.getOpcode())) { return false; } diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir index 08693ec9db1d4..2492eb2982aac 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir @@ -133,7 +133,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 %noninlinable, [[COPY]], 1056964608, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 @@ -152,7 +153,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], %noninlinable, 1056964608, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 @@ -210,7 +212,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept 
S_FMAAK_F32 [[COPY]], 1056964608, 1234567890, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, %noninlinable, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir index 5f985605c082d..c8afb89aa272a 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir @@ -69,3 +69,202 @@ body: | %0:sreg_32 = S_MOV_B32 63 %1:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def $scc ... + +# GCN-LABEL: name: test_no_fold_literal_already_inline_lhs{{$}} +# GCN: %0:sreg_32 = S_MOV_B32 80 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 70, %0 +--- +name: test_no_fold_literal_already_inline_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_no_fold_literal_already_inline_rhs{{$}} +# GCN: %0:sreg_32 = S_MOV_B32 80 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %0, 70 +--- +name: test_no_fold_literal_already_inline_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 %0, 70, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_literal_inlineimm_lhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 64, 80 +--- +name: test_fold_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 64, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_literal_inlineimm_rhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 80, 64 +--- +name: test_fold_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 %0, 64, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_same_literal_2x{{$}} +# GCN: %2:sreg_32 = S_ADD_I32 70, %1 +--- +name: test_fold_same_literal_2x +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 70 + %2:sreg_32 = S_ADD_I32 %0, %1, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_same_literal_lhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 70, %0 +--- +name: test_fold_same_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_ADD_I32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_same_literal_rhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 %0, 70 +--- +name: test_fold_same_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_ADD_I32 %0, 70, implicit-def $scc +... + + +# GCN-LABEL: name: test_s_cselect_b32_2x_literal_fold{{$}} +# GCN: %2:sreg_32 = S_CSELECT_B32 70, %1, implicit $scc +--- +name: test_s_cselect_b32_2x_literal_fold +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %2:sreg_32 = S_CSELECT_B32 %0, %1, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_literal_lhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 70, %0, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 70, %0, implicit $scc +... 
+ +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_literal_rhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 %0, 70, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 %0, 70, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_inlineimm_lhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 64, 80, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 64, %0, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_inlineimm_rhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 80, 64, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 %0, 64, implicit $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_2x_literal_fold{{$}} +# GCN: S_CMP_EQ_U32 70, %1, implicit-def $scc +--- +name: test_s_cmp_b32_2x_literal_fold +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + S_CMP_EQ_U32 %0, %1, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_literal_lhs{{$}} +# GCN: S_CMP_EQ_U32 70, %0, implicit-def $scc +--- +name: test_s_cmp_b32_literal_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_literal_rhs{{$}} +# GCN: S_CMP_EQ_U32 %0, 70, implicit-def $scc +--- +name: test_s_cmp_b32_literal_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 %0, 70, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_inlineimm_lhs{{$}} +# GCN: S_CMP_EQ_U32 64, 80, implicit-def $scc +--- +name: test_s_cmp_b32_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 64, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_inlineimm_rhs{{$}} +# GCN: S_CMP_EQ_U32 80, 64, implicit-def $scc +--- +name: test_s_cmp_b32_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 %0, 64, implicit-def $scc +... From 27e6561d108e8a3c17432b14bb5e8675c22a787b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 18 Feb 2025 19:21:49 -0800 Subject: [PATCH 010/220] [Sparc] Use MCRegister. 
NFC --- llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 10 +++++----- .../lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 3e9fc31d7bfc2..62854ea896179 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -525,7 +525,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToIntPairReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_IntReg); unsigned regIdx = 32; if (Reg >= Sparc::G0 && Reg <= Sparc::G7) @@ -544,7 +544,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToDoubleReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_FloatReg); unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) @@ -555,7 +555,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToQuadReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = 0; switch (Op.Reg.Kind) { default: llvm_unreachable("Unexpected register kind!"); @@ -578,7 +578,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToCoprocPairReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_CoprocReg); unsigned regIdx = 32; if (Reg >= Sparc::C0 && Reg <= Sparc::C31) @@ -592,7 +592,7 @@ class SparcOperand : public MCParsedAsmOperand { static std::unique_ptr MorphToMEMrr(unsigned Base, std::unique_ptr Op) { - unsigned offsetReg = Op->getReg(); + MCRegister offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; Op->Mem.OffsetReg = offsetReg; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp index 37503f4bc2ae2..f2a61c95fefb5 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp @@ -66,12 +66,12 @@ bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, return false; if (!MI->getOperand(0).isReg()) return false; - switch (MI->getOperand(0).getReg()) { + switch (MI->getOperand(0).getReg().id()) { default: return false; case SP::G0: // jmp $addr | ret | retl if (MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 8) { - switch(MI->getOperand(1).getReg()) { + switch (MI->getOperand(1).getReg().id()) { default: break; case SP::I7: O << "\tret"; return true; case SP::O7: O << "\tretl"; return true; @@ -115,7 +115,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, const MCOperand &MO = MI->getOperand (opNum); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); if (isV9(STI)) printRegName(O, Reg, SP::RegNamesStateReg); else From 8187caf8e3691b47ca5c9849df4cebd2f46a8fea Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 18 Feb 2025 23:42:51 -0500 Subject: [PATCH 011/220] [NFC][AMDGPU] Pre-commit a test case of checking register coalescer on `v_pk_mov_b32` (#127715) This PR serves as a preliminary step, adding a test case for register coalescer on v_pk_mov_b32. It is intended to demonstrate the code changes introduced in an upcoming PR. 
--- .../AMDGPU/vgpr-remat-v_pk_mov_b32.mir | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir new file mode 100644 index 0000000000000..c8d6bf386078f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir @@ -0,0 +1,49 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=register-coalescer -o - %s | FileCheck %s + +--- +name: test_remat_v_pk_mov_b32 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_remat_v_pk_mov_b32 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[V_PK_MOV_B32_]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[V_PK_MOV_B32_]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr0 + ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY2]] + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[COPY1]], 8, [[COPY1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] + ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_MOV_B32_]] + bb.0: + liveins: $sgpr0 + %0:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + %1:vreg_64_align2 = COPY %0:vreg_64_align2 + %2:vreg_64_align2 = COPY %0:vreg_64_align2 + %3:sreg_64 = COPY $sgpr0 + $exec = S_MOV_B64_term %3:sreg_64 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %1:vreg_64_align2 = V_PK_ADD_F32 8, %1, 8, %1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vreg_64_align2 = V_PK_ADD_F32 8, %2, 8, %2, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + + bb.2: + S_NOP 0, implicit %1 + S_NOP 0, implicit %2 + S_ENDPGM 0, implicit %0 +... From 6c39ee717f03a0fe28f563d525fa5aff09804ba8 Mon Sep 17 00:00:00 2001 From: tianleliu Date: Tue, 18 Feb 2025 20:45:45 -0800 Subject: [PATCH 012/220] [Driver][MSVC] Pass profile file to lld-link via -lto-sample-profile option (#127442) In SPGO lto mode, linker needs -lto-sample-profile option to set sample profile file. Linux adds this option by transferring fprofile-sample-use to -plugin-opt=sample-profile=, which is alias of lto-sample-profile. (in clang\lib\Driver\ToolChains\CommonArgs.cpp: tools::addLTOOptions()). But clang on Windows misses the transferring. So add it now. 
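A condensed usage sketch of the intended behavior (file names are placeholders; the new cl-link.c test below checks the same thing through lit substitutions): with LTO and a sample profile enabled, the profile named by `-fprofile-sample-use=` should now reach lld-link.

```
$ clang-cl --target=x86_64-unknown-windows-msvc /Tc test.c -flto -fuse-ld=lld -fprofile-sample-use=file.prof -###
...
 "lld-link" ... "-lto-sample-profile:file.prof" ...
```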
--- clang/lib/Driver/ToolChains/MSVC.cpp | 5 +++++ clang/test/Driver/cl-link.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index bae41fc06c036..d5a7fc7e85230 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -232,6 +232,11 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + if (C.getDriver().isUsingLTO()) { + if (Arg *A = tools::getLastProfileSampleUseArg(Args)) + CmdArgs.push_back(Args.MakeArgString(std::string("-lto-sample-profile:") + + A->getValue())); + } Args.AddAllArgValues(CmdArgs, options::OPT__SLASH_link); // Control Flow Guard checks diff --git a/clang/test/Driver/cl-link.c b/clang/test/Driver/cl-link.c index 9bf8a8137926d..726bc26a64edd 100644 --- a/clang/test/Driver/cl-link.c +++ b/clang/test/Driver/cl-link.c @@ -71,3 +71,6 @@ // RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /Tc%s -fuse-ld=lld -### -fsanitize=address 2>&1 | FileCheck --check-prefix=INFER-LLD %s // INFER-LLD: lld-link // INFER-LLD-NOT: INFERASANLIBS + +// RUN: %clang_cl --target=x86_64-unknown-windows-msvc /Tc%s -flto -fuse-ld=lld -### -fprofile-sample-use=%S/Inputs/file.prof 2>&1 | FileCheck -check-prefix=CHECK-SAMPLE-PROFILE %s +// CHECK-SAMPLE-PROFILE: "-lto-sample-profile:{{.*}}/file.prof" From 8b284dc31070b9d1d99c593146da6248a5ca545d Mon Sep 17 00:00:00 2001 From: Hongren Zheng Date: Wed, 19 Feb 2025 12:49:40 +0800 Subject: [PATCH 013/220] [mlir] Fix FunctionOpInterface impl for external func (#124693) For function declarations (i.e. func op has no entry block), the FunctionOpInterface method `insertArgument` and `eraseArgument` will cause segfault. This PR guards against manipulation of empty entry block by checking whether func op is external. 
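A minimal sketch of a trigger (the function name and types are illustrative, not taken from the patch): an external function is just a declaration, e.g.

```
// A declaration: the func op has no entry block.
func.func private @decl(f32)
```

so a call such as `insertArgument(0, i32Type, /*argAttrs=*/{}, loc)` through FunctionOpInterface previously dereferenced the missing entry block via `op->getRegion(0).front()`; with this change the function type and argument attributes are still updated, but the block-argument update is skipped for external ops.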
An example can be seen in https://github.com/google/heir/pull/1324 The segfault trace ``` #1 0x0000560f1289d9db PrintStackTraceSignalHandler(void*) /proc/self/cwd/external/llvm-project/llvm/lib/Support/Unix/Signals.inc:874:1 #2 0x0000560f1289b116 llvm::sys::RunSignalHandlers() /proc/self/cwd/external/llvm-project/llvm/lib/Support/Signals.cpp:105:5 #3 0x0000560f1289e145 SignalHandler(int) /proc/self/cwd/external/llvm-project/llvm/lib/Support/Unix/Signals.inc:415:1 #4 0x00007f829a3d9520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520) #5 0x0000560f1257f8bc void __gnu_cxx::new_allocator::construct(mlir::BlockArgument*, mlir::BlockArgument&&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/ext/new_allocator.h:162:23 #6 0x0000560f1257f84d void std::allocator_traits >::construct(std::allocator&, mlir::BlockArgument*, mlir::BlockArgument&&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/alloc_traits.h:520:2 #7 0x0000560f12580498 void std::vector >::_M_insert_aux(__gnu_cxx::__normal_iterator > >, mlir::BlockArgument&&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/vector.tcc:405:7 #8 0x0000560f1257cf7e std::vector >::insert(__gnu_cxx::__normal_iterator > >, mlir::BlockArgument const&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/vector.tcc:154:6 #9 0x0000560f1257b349 mlir::Block::insertArgument(unsigned int, mlir::Type, mlir::Location) /proc/self/cwd/external/llvm-project/mlir/lib/IR/Block.cpp:178:13 #10 0x0000560f123d2a1c mlir::function_interface_impl::insertFunctionArguments(mlir::FunctionOpInterface, llvm::ArrayRef, mlir::TypeRange, llvm::ArrayRef, llvm::ArrayRef, unsigned int, mlir::Type) /proc/self/cwd/external/llvm-project/mlir/lib/Interfaces/FunctionInterfaces.cpp:232:11 #11 0x0000560f0be6b727 mlir::detail::FunctionOpInterfaceTrait::insertArguments(llvm::ArrayRef, mlir::TypeRange, llvm::ArrayRef, llvm::ArrayRef) /proc/self/cwd/bazel-out/k8-dbg/bin/external/llvm-project/mlir/include/mlir/Interfaces/FunctionInterfaces.h.inc:809:7 #12 0x0000560f0be6b536 mlir::detail::FunctionOpInterfaceTrait::insertArgument(unsigned int, mlir::Type, mlir::DictionaryAttr, mlir::Location) /proc/self/cwd/bazel-out/k8-dbg/bin/external/llvm-project/mlir/include/mlir/Interfaces/FunctionInterfaces.h.inc:796:7 ``` --- mlir/lib/Interfaces/FunctionInterfaces.cpp | 26 ++++++++++++++-------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Interfaces/FunctionInterfaces.cpp b/mlir/lib/Interfaces/FunctionInterfaces.cpp index 80f47a3f83676..57a8668117c68 100644 --- a/mlir/lib/Interfaces/FunctionInterfaces.cpp +++ b/mlir/lib/Interfaces/FunctionInterfaces.cpp @@ -199,8 +199,7 @@ void function_interface_impl::insertFunctionArguments( // There are 3 things that need to be updated: // - Function type. // - Arg attrs. - // - Block arguments of entry block. - Block &entry = op->getRegion(0).front(); + // - Block arguments of entry block, if not empty. // Update the argument attributes of the function. ArrayAttr oldArgAttrs = op.getArgAttrsAttr(); @@ -226,10 +225,15 @@ void function_interface_impl::insertFunctionArguments( setAllArgAttrDicts(op, newArgAttrs); } - // Update the function type and any entry block arguments. + // Update the function type. op.setFunctionTypeAttr(TypeAttr::get(newType)); - for (unsigned i = 0, e = argIndices.size(); i < e; ++i) - entry.insertArgument(argIndices[i] + i, argTypes[i], argLocs[i]); + + // Update entry block arguments, if not empty. 
+ if (!op.isExternal()) { + Block &entry = op->getRegion(0).front(); + for (unsigned i = 0, e = argIndices.size(); i < e; ++i) + entry.insertArgument(argIndices[i] + i, argTypes[i], argLocs[i]); + } } void function_interface_impl::insertFunctionResults( @@ -279,8 +283,7 @@ void function_interface_impl::eraseFunctionArguments( // There are 3 things that need to be updated: // - Function type. // - Arg attrs. - // - Block arguments of entry block. - Block &entry = op->getRegion(0).front(); + // - Block arguments of entry block, if not empty. // Update the argument attributes of the function. if (ArrayAttr argAttrs = op.getArgAttrsAttr()) { @@ -292,9 +295,14 @@ void function_interface_impl::eraseFunctionArguments( setAllArgAttrDicts(op, newArgAttrs); } - // Update the function type and any entry block arguments. + // Update the function type. op.setFunctionTypeAttr(TypeAttr::get(newType)); - entry.eraseArguments(argIndices); + + // Update entry block arguments, if not empty. + if (!op.isExternal()) { + Block &entry = op->getRegion(0).front(); + entry.eraseArguments(argIndices); + } } void function_interface_impl::eraseFunctionResults( From 960b80c9f89eef9203ccd46c449008958eb3d5d9 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 18 Feb 2025 21:19:14 -0800 Subject: [PATCH 014/220] [ELF] Improve INPUT_SECTION_FLAGS test --- lld/test/ELF/input-section-flags.s | 1 + 1 file changed, 1 insertion(+) diff --git a/lld/test/ELF/input-section-flags.s b/lld/test/ELF/input-section-flags.s index 0c8e31c77b0dc..f848d55e6fddc 100644 --- a/lld/test/ELF/input-section-flags.s +++ b/lld/test/ELF/input-section-flags.s @@ -15,6 +15,7 @@ # RUN: .outsec3 : { INPUT_SECTION_FLAGS(SHF_WRITE) *(.sec.*) } \ # RUN: .outsec4 : { INPUT_SECTION_FLAGS(SHF_MERGE & !SHF_STRINGS) *(.sec.*) } \ # RUN: .outsec5 : { INPUT_SECTION_FLAGS(SHF_STRINGS) *(.sec.*) } \ +# RUN: .outsec6 : { INPUT_SECTION_FLAGS(!SHF_TLS & !SHF_EXCLUDE & !SHF_COMPRESSED & !SHF_ARM_PURECODE) *(.sec.*) } \ # RUN: } " > %t.script # RUN: ld.lld -o %t1 --script %t.script %t.o # RUN: llvm-readobj --symbols %t1 | FileCheck %s From a44284c02f39f68a754471a7e00b61ebf448d271 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 19 Feb 2025 00:51:57 -0500 Subject: [PATCH 015/220] [AMDGPU] Add `isAsCheapAsAMove` for `v_pk_mov_b32` (#127632) Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 +- .../CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 21898da1912f5..d5c6e8af109f4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1151,7 +1151,7 @@ let isCommutable = 1, isReMaterializable = 1 in { defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile, any_fadd>; } // End SubtargetPredicate = HasPackedFP32Ops - let SubtargetPredicate = HasPkMovB32 in + let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile>; } // End isCommutable = 1, isReMaterializable = 1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir index c8d6bf386078f..9af18758e2206 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir @@ -11,22 +11,22 @@ body: | ; CHECK-NEXT: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[V_PK_MOV_B32_]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[V_PK_MOV_B32_]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr0 - ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY2]] + ; CHECK-NEXT: [[V_PK_MOV_B32_1:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_2:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0 + ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]] ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[COPY]], 8, [[COPY]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[COPY1]], 8, [[COPY1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_MOV_B32_1]], 8, [[V_PK_MOV_B32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_MOV_B32_2]], 8, [[V_PK_MOV_B32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_1]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_2]] ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_MOV_B32_]] bb.0: liveins: $sgpr0 From fb394451ca8de4536ba9a8aca2144248bc3fdea8 Mon Sep 17 00:00:00 2001 From: LiqinWeng Date: Wed, 19 Feb 2025 14:11:16 +0800 Subject: [PATCH 016/220] [RISCV][VLOPT] Add vfsqrt/vfrsqrt7 instruction to isSupportInstr (#127462) --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 4 ++ .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 6 +-- llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 48 +++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vl-opt.mir | 40 ++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index e5a98598370ec..66b989a84b1ce 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -1092,6 +1092,10 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VFWNMSAC_VF: case RISCV::VFWMACCBF16_VV: case RISCV::VFWMACCBF16_VF: + // Vector Floating-Point Square-Root Instruction + case RISCV::VFSQRT_V: + // Vector Floating-Point Reciprocal Square-Root Estimate Instruction + case RISCV::VFRSQRT7_V: // Vector Floating-Point MIN/MAX Instructions case RISCV::VFMIN_VF: case RISCV::VFMIN_VV: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 585a331e55094..bef29dfecef4c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -1318,11 +1318,10 @@ define void @sqrt_v6bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsqrt.v v8, v10 -; 
CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -1371,11 +1370,10 @@ define void @sqrt_v6f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v8, v10 -; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index c6ee9e34dc207..5cd9b77af82cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -5069,3 +5069,51 @@ define @vfwmaccbf16_vf( %a, bfloat %b, %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) ret %2 } + +define @vfsqrt( %a) { +; NOVLOPT-LABEL: vfsqrt: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: fsrmi a0, 0 +; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; NOVLOPT-NEXT: vfsqrt.v v10, v8 +; NOVLOPT-NEXT: fsrm a0 +; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; NOVLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; NOVLOPT-NEXT: vmv4r.v v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfsqrt: +; VLOPT: # %bb.0: +; VLOPT-NEXT: fsrmi a0, 0 +; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; VLOPT-NEXT: vfsqrt.v v10, v8 +; VLOPT-NEXT: fsrm a0 +; VLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; VLOPT-NEXT: vmv4r.v v8, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfsqrt.nxv4f32( poison, %a, iXLen 0, iXLen 7) + %2 = call @llvm.riscv.vfwmacc( poison, %a, %1, iXLen 7, iXLen 6, iXLen 0) + ret %2 +} + +define @vfrsqrt7( %a) { +; NOVLOPT-LABEL: vfrsqrt7: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; NOVLOPT-NEXT: vfrsqrt7.v v10, v8 +; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; NOVLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; NOVLOPT-NEXT: vmv4r.v v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfrsqrt7: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; VLOPT-NEXT: vfrsqrt7.v v10, v8 +; VLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; VLOPT-NEXT: vmv4r.v v8, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfrsqrt7.nxv4f32( poison, %a, iXLen 7) + %2 = call @llvm.riscv.vfwmacc( poison, %a, %1, iXLen 7, iXLen 6, iXLen 0) + ret %2 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 0475a988e9851..cb43a89ea3bc6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -141,6 +141,46 @@ body: | %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 ... --- +name: vfsqrt_nofpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfsqrt_nofpexcept + ; CHECK: %x:vrm2 = nofpexcept PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 6, 5 /* e32 */, 3 /* ta, ma */, implicit $frm + ; CHECK-NEXT: early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4 /* e16 */, 3 /* ta, ma */, implicit $frm + %x:vrm2 = nofpexcept PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5, 3, implicit $frm + early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4, 3, implicit $frm +... 
+--- +name: vfsqrt_fpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfsqrt_fpexcept + ; CHECK: %x:vrm2 = PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5 /* e32 */, 3 /* ta, ma */, implicit $frm + ; CHECK-NEXT: early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4 /* e16 */, 3 /* ta, ma */, implicit $frm + %x:vrm2 = PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5, 3, implicit $frm + early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4, 3, implicit $frm +... +--- +name: vfrsqrt7_nofpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfrsqrt7_nofpexcept + ; CHECK: %x:vrm2 = nofpexcept PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vrm2 = nofpexcept PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5, 0 + %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- +name: vfrsqrt7_fpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfrsqrt7_fpexcept + ; CHECK: %x:vrm2 = PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vrm2 = PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5, 0 + %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- name: vwadd_tied_vs1 body: | bb.0: From 1c02c8f6fcbea1750e0c50c8a22af29b19588d72 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 19 Feb 2025 07:41:27 +0100 Subject: [PATCH 017/220] [clang] fix use after free in clang/tools/c-index-test/c-index-test.c (#127063) recent change e76739eeb952940b2979c70ba44a28fecf592695 has exposed use after free in GetCursorSource() function that returned pointer to a disposed CXString --- clang/tools/c-index-test/c-index-test.c | 51 ++++++++++++++++--------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index a9d8261bd03e7..fed6fe0736904 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -1213,7 +1213,21 @@ static void PrintCursor(CXCursor Cursor, const char *CommentSchemaFile) { } } -static const char* GetCursorSource(CXCursor Cursor) { +static CXString createCXString(const char *CS) { + CXString Str; + Str.data = (const void *)CS; + Str.private_flags = 0; + return Str; +} + +static CXString duplicateCXString(const char *CS) { + CXString Str; + Str.data = strdup(CS); + Str.private_flags = 1; // CXS_Malloc + return Str; +} + +static CXString GetCursorSource(CXCursor Cursor) { CXSourceLocation Loc = clang_getCursorLocation(Cursor); CXString source; CXFile file; @@ -1221,20 +1235,12 @@ static const char* GetCursorSource(CXCursor Cursor) { source = clang_getFileName(file); if (!clang_getCString(source)) { clang_disposeString(source); - return ""; + return createCXString(""); } - else { - const char *b = basename(clang_getCString(source)); - clang_disposeString(source); - return b; - } -} - -static CXString createCXString(const char *CS) { - CXString Str; - Str.data = (const void *) CS; - Str.private_flags = 0; - return Str; + const char *b = basename(clang_getCString(source)); + CXString result = duplicateCXString(b); + clang_disposeString(source); + return result; } /******************************************************************************/ @@ -1358,8 +1364,10 @@ enum CXChildVisitResult FilteredPrintingVisitor(CXCursor Cursor, CXSourceLocation Loc = 
clang_getCursorLocation(Cursor); unsigned line, column; clang_getFileLocation(Loc, 0, &line, &column, 0); - printf("// %s: %s:%d:%d: ", FileCheckPrefix, - GetCursorSource(Cursor), line, column); + CXString source = GetCursorSource(Cursor); + printf("// %s: %s:%d:%d: ", FileCheckPrefix, clang_getCString(source), line, + column); + clang_disposeString(source); PrintCursor(Cursor, Data->CommentSchemaFile); PrintCursorExtent(Cursor); if (clang_isDeclaration(Cursor.kind)) { @@ -1428,8 +1436,10 @@ static enum CXChildVisitResult FunctionScanVisitor(CXCursor Cursor, if (Ref.kind == CXCursor_NoDeclFound) { /* Nothing found here; that's fine. */ } else if (Ref.kind != CXCursor_FunctionDecl) { - printf("// %s: %s:%d:%d: ", FileCheckPrefix, GetCursorSource(Ref), - curLine, curColumn); + CXString CursorSource = GetCursorSource(Ref); + printf("// %s: %s:%d:%d: ", FileCheckPrefix, + clang_getCString(CursorSource), curLine, curColumn); + clang_disposeString(CursorSource); PrintCursor(Ref, Data->CommentSchemaFile); printf("\n"); } @@ -1455,7 +1465,10 @@ enum CXChildVisitResult USRVisitor(CXCursor C, CXCursor parent, clang_disposeString(USR); return CXChildVisit_Recurse; } - printf("// %s: %s %s", FileCheckPrefix, GetCursorSource(C), cstr); + CXString CursorSource = GetCursorSource(C); + printf("// %s: %s %s", FileCheckPrefix, clang_getCString(CursorSource), + cstr); + clang_disposeString(CursorSource); PrintCursorExtent(C); printf("\n"); From 6b67aac31377992465c419b5a296f9a7ba8e7984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ingo=20M=C3=BCller?= Date: Wed, 19 Feb 2025 08:07:49 +0100 Subject: [PATCH 018/220] [mlir:python] Improve `mlir_(attribute|type|value)_subclass` for `nanobind`s `stubgen` (#127584) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR makes several improvements to the stubs that are created by `mlir_(attribute|type|value)_subclass`. First, the PR sets the `__module__` attribute of the classes generated by the nanobind adaptors for attributes, types, and values (via `mlir_(attribute|type|value)_subclass`). By default, the `__module__` property is set to `importlib._bootstrap`, which isn't where we want the new class to live. The new logic sets the property to the name of the module provided as `scope` instead. This also makes nanobind's `stubgen` generate stubs for those classes properly, which ignores classes whose `__module__` does not correspond to the module it is generating stubs for. This resolves #127518. Second, the PR overwrites the function signatures generated by `stubgen` to a format that uses the desired type names (e.g., `mlir.ir.Attribute` instead of `MlirAttribute`). Finally, the PR piggy-backs some minor doc and style improvements to `PythonAdaptors.h`. --------- Signed-off-by: Ingo Müller --- .../mlir/Bindings/Python/NanobindAdaptors.h | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h index 517351cac6dbc..0608182f00b7e 100644 --- a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h @@ -23,8 +23,10 @@ #include "mlir-c/Diagnostics.h" #include "mlir-c/IR.h" +// clang-format off #include "mlir/Bindings/Python/Nanobind.h" #include "mlir-c/Bindings/Python/Interop.h" // This is expected after nanobind. 
+// clang-format on #include "llvm/ADT/Twine.h" // Raw CAPI type casters need to be declared before use, so always include them @@ -349,6 +351,7 @@ class pure_subclass { thisClass = metaclass(derivedClassName, nanobind::make_tuple(superClass), attributes); scope.attr(derivedClassName) = thisClass; + thisClass.attr("__module__") = scope.attr("__name__"); } template @@ -434,7 +437,7 @@ class mlir_attribute_subclass : public pure_subclass { const nanobind::object &superCls, GetTypeIDFunctionTy getTypeIDFunction = nullptr) : pure_subclass(scope, typeClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -465,10 +468,13 @@ class mlir_attribute_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. + static const char kIsinstanceSig[] = + "def isinstance(other_attribute: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Attribute) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirAttribute other) { return isaFunction(other); }, - nanobind::arg("other_attribute")); + nanobind::arg("other_attribute"), nanobind::sig(kIsinstanceSig)); def("__repr__", [superCls, captureTypeName](nanobind::object self) { return nanobind::repr(superCls(self)) .attr("replace")(superCls.attr("__name__"), captureTypeName); @@ -512,7 +518,7 @@ class mlir_type_subclass : public pure_subclass { const nanobind::object &superCls, GetTypeIDFunctionTy getTypeIDFunction = nullptr) : pure_subclass(scope, typeClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -542,13 +548,17 @@ class mlir_type_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. + static const char kIsinstanceSig[] = + "def isinstance(other_type: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Type) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirType other) { return isaFunction(other); }, - nanobind::arg("other_type")); + nanobind::arg("other_type"), nanobind::sig(kIsinstanceSig)); def("__repr__", [superCls, captureTypeName](nanobind::object self) { - return nanobind::repr(superCls(self)) - .attr("replace")(superCls.attr("__name__"), captureTypeName); + return nanobind::cast( + nanobind::repr(superCls(self)) + .attr("replace")(superCls.attr("__name__"), captureTypeName)); }); if (getTypeIDFunction) { // 'get_static_typeid' method. @@ -590,7 +600,7 @@ class mlir_value_subclass : public pure_subclass { IsAFunctionTy isaFunction, const nanobind::object &superCls) : pure_subclass(scope, valueClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. 
@@ -620,10 +630,13 @@ class mlir_value_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. + static const char kIsinstanceSig[] = + "def isinstance(other_value: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Value) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirValue other) { return isaFunction(other); }, - nanobind::arg("other_value")); + nanobind::arg("other_value"), nanobind::sig(kIsinstanceSig)); } }; From 8f41d28d89ee287d0f5a6518116ab316be2657b8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 18 Feb 2025 23:18:08 -0800 Subject: [PATCH 019/220] [c-index-test] Fix warnings This patch fixes: clang/tools/c-index-test/c-index-test.c:1240:15: error: mixing declarations and code is a C99 extension [-Werror,-Wdeclaration-after-statement] clang/tools/c-index-test/c-index-test.c:1367:14: error: mixing declarations and code is a C99 extension [-Werror,-Wdeclaration-after-statement] clang/tools/c-index-test/c-index-test.c:1468:14: error: mixing declarations and code is a C99 extension [-Werror,-Wdeclaration-after-statement] --- clang/tools/c-index-test/c-index-test.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index fed6fe0736904..0e7de8b98ea07 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -1231,14 +1231,16 @@ static CXString GetCursorSource(CXCursor Cursor) { CXSourceLocation Loc = clang_getCursorLocation(Cursor); CXString source; CXFile file; + const char *b; + CXString result; clang_getExpansionLocation(Loc, &file, 0, 0, 0); source = clang_getFileName(file); if (!clang_getCString(source)) { clang_disposeString(source); return createCXString(""); } - const char *b = basename(clang_getCString(source)); - CXString result = duplicateCXString(b); + b = basename(clang_getCString(source)); + result = duplicateCXString(b); clang_disposeString(source); return result; } @@ -1363,8 +1365,9 @@ enum CXChildVisitResult FilteredPrintingVisitor(CXCursor Cursor, if (!Data->Filter || (Cursor.kind == *(enum CXCursorKind *)Data->Filter)) { CXSourceLocation Loc = clang_getCursorLocation(Cursor); unsigned line, column; + CXString source; clang_getFileLocation(Loc, 0, &line, &column, 0); - CXString source = GetCursorSource(Cursor); + source = GetCursorSource(Cursor); printf("// %s: %s:%d:%d: ", FileCheckPrefix, clang_getCString(source), line, column); clang_disposeString(source); @@ -1461,11 +1464,12 @@ enum CXChildVisitResult USRVisitor(CXCursor C, CXCursor parent, if (!Data->Filter || (C.kind == *(enum CXCursorKind *)Data->Filter)) { CXString USR = clang_getCursorUSR(C); const char *cstr = clang_getCString(USR); + CXString CursorSource; if (!cstr || cstr[0] == '\0') { clang_disposeString(USR); return CXChildVisit_Recurse; } - CXString CursorSource = GetCursorSource(C); + CursorSource = GetCursorSource(C); printf("// %s: %s %s", FileCheckPrefix, clang_getCString(CursorSource), cstr); clang_disposeString(CursorSource); From c4f8da94a143b954de975091b3ec563bb01b0837 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Wed, 19 Feb 2025 08:46:53 +0100 Subject: [PATCH 020/220] [MLIR][emitc]: Remove unused functions (NFC) --- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 1e8952a7edf4e..eb7ffe2e032c4 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ 
b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -991,27 +991,6 @@ LogicalResult emitc::VerbatimOp::verify() { return success(); } -[[maybe_unused]] static ParseResult -parseVariadicTypeFmtArgs(AsmParser &p, SmallVector ¶ms) { - Type type; - if (p.parseType(type)) - return failure(); - - params.push_back(type); - while (succeeded(p.parseOptionalComma())) { - if (p.parseType(type)) - return failure(); - params.push_back(type); - } - - return success(); -} - -[[maybe_unused]] static void printVariadicTypeFmtArgs(AsmPrinter &p, - ArrayRef params) { - llvm::interleaveComma(params, p, [&](Type type) { p.printType(type); }); -} - FailureOr> emitc::VerbatimOp::parseFormatString() { // Error checking is done in verify. return ::parseFormatString(getValue(), getFmtArgs()); From 9072ba71cac6d518b4164615c609e358d49c4ed2 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Tue, 18 Feb 2025 23:54:45 -0800 Subject: [PATCH 021/220] [libc] Add strftime_l (#127708) This is a (no-op) locale version of strftime. --- libc/config/baremetal/aarch64/entrypoints.txt | 2 ++ libc/config/baremetal/arm/entrypoints.txt | 2 ++ libc/config/baremetal/riscv/entrypoints.txt | 2 ++ libc/config/linux/x86_64/entrypoints.txt | 1 + libc/include/time.yaml | 11 +++++++ libc/src/time/CMakeLists.txt | 14 ++++++++ libc/src/time/strftime.cpp | 1 - libc/src/time/strftime_l.cpp | 33 +++++++++++++++++++ libc/src/time/strftime_l.h | 25 ++++++++++++++ 9 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 libc/src/time/strftime_l.cpp create mode 100644 libc/src/time/strftime_l.h diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt index 44c4ab49e5c58..2c226ef176c08 100644 --- a/libc/config/baremetal/aarch64/entrypoints.txt +++ b/libc/config/baremetal/aarch64/entrypoints.txt @@ -248,6 +248,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.gmtime libc.src.time.gmtime_r libc.src.time.mktime + libc.src.time.strftime + libc.src.time.strftime_l libc.src.time.timespec_get # internal entrypoints diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 370b5462fe9e8..6fd1fce3ab245 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -248,6 +248,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.gmtime libc.src.time.gmtime_r libc.src.time.mktime + libc.src.time.strftime + libc.src.time.strftime_l libc.src.time.timespec_get # internal entrypoints diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 07311a60a17a2..5985c495bdaf2 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -244,6 +244,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.gmtime libc.src.time.gmtime_r libc.src.time.mktime + libc.src.time.strftime + libc.src.time.strftime_l libc.src.time.timespec_get # internal entrypoints diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index a4f6671a59789..2e3af00ec303d 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1128,6 +1128,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.mktime libc.src.time.nanosleep libc.src.time.strftime + libc.src.time.strftime_l libc.src.time.time libc.src.time.timespec_get diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 37ee824678cda..7bb25dbe85ac4 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -9,6 +9,7 @@ types: - type_name: 
time_t - type_name: clock_t - type_name: size_t + - type_name: locale_t enums: [] objects: [] functions: @@ -100,6 +101,16 @@ functions: - type: size_t - type: const char *__restrict - type: const struct tm *__restrict + - name: strftime_l + standard: + - stdc + return_type: size_t + arguments: + - type: char *__restrict + - type: size_t + - type: const char *__restrict + - type: const struct tm *__restrict + - type: locale_t - name: time standard: - stdc diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt index 8332e8ab66f97..3b951df810011 100644 --- a/libc/src/time/CMakeLists.txt +++ b/libc/src/time/CMakeLists.txt @@ -150,6 +150,20 @@ add_entrypoint_object( libc.src.time.strftime_core.strftime_main ) +add_entrypoint_object( + strftime_l + SRCS + strftime_l.cpp + HDRS + strftime_l.h + DEPENDS + libc.hdr.types.locale_t + libc.hdr.types.size_t + libc.hdr.types.struct_tm + libc.src.stdio.printf_core.writer + libc.src.time.strftime_core.strftime_main +) + add_entrypoint_object( time SRCS diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp index 4b89bf2ea3a70..c19e58fbadf71 100644 --- a/libc/src/time/strftime.cpp +++ b/libc/src/time/strftime.cpp @@ -19,7 +19,6 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(size_t, strftime, (char *__restrict buffer, size_t buffsz, const char *__restrict format, const tm *timeptr)) { - printf_core::WriteBuffer wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(&wb); int ret = strftime_core::strftime_main(&writer, format, timeptr); diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp new file mode 100644 index 0000000000000..4203136af4cba --- /dev/null +++ b/libc/src/time/strftime_l.cpp @@ -0,0 +1,33 @@ +//===-- Implementation of strftime_l function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/time/strftime_l.h" +#include "hdr/types/size_t.h" +#include "hdr/types/struct_tm.h" +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/stdio/printf_core/writer.h" +#include "src/time/strftime_core/strftime_main.h" + +namespace LIBC_NAMESPACE_DECL { + +// TODO: Add support for locales. +LLVM_LIBC_FUNCTION(size_t, strftime_l, + (char *__restrict buffer, size_t count, + const char *__restrict format, + const struct tm *__restrict tp, locale_t)) { + printf_core::WriteBuffer wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); + printf_core::Writer writer(&wb); + int ret = strftime_core::strftime_main(&writer, format, timeptr); + if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. + wb.buff[wb.buff_cur] = '\0'; + return (ret < 0 || static_cast(ret) > buffsz) ? 0 : ret; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_l.h b/libc/src/time/strftime_l.h new file mode 100644 index 0000000000000..158c4a63986b4 --- /dev/null +++ b/libc/src/time/strftime_l.h @@ -0,0 +1,25 @@ +//===-- Implementation header for strftime_l --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_TIME_STRFTIME_L_H +#define LLVM_LIBC_SRC_TIME_STRFTIME_L_H + +#include "hdr/types/size_t.h" +#include "hdr/types/struct_tm.h" +#include "include/llvm-libc-types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +size_t strftime_l(char *__restrict buffer, size_t count, + const char *__restrict format, const struct tm *__restrict tp, + locale_t); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_TIME_STRFTIME_L_H From 850062cf498a7aa408e0ff071c4c52ad5c135355 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 19 Feb 2025 09:16:56 +0100 Subject: [PATCH 022/220] [AA] Consider extractvalue and extractelement as escape sources (#127640) CaptureTracking considers insertions into aggregates and vectors as captures. As such, extractions from aggregates and vectors are escape sources. A non-escaping identified local cannot alias with the result of an extractvalue/extractelement. Fixes https://github.com/llvm/llvm-project/issues/126670. --- llvm/lib/Analysis/AliasAnalysis.cpp | 6 +++++ .../BasicAA/escape-source-aggregate.ll | 27 +++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 58297accc7f1f..aa72fb844ef32 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -859,6 +859,12 @@ bool llvm::isEscapeSource(const Value *V) { if (isa(V)) return true; + // Capture tracking considers insertions into aggregates and vectors as + // captures. As such, extractions from aggregates and vectors are escape + // sources. + if (isa(V)) + return true; + // Same for inttoptr constant expressions. 
if (auto *CE = dyn_cast(V)) if (CE->getOpcode() == Instruction::IntToPtr) diff --git a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll index cef11b94f3873..872715f31011d 100644 --- a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll +++ b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll @@ -2,8 +2,9 @@ declare { ptr, i1 } @get_struct() declare <2 x ptr> @get_vec() +declare void @escape(ptr) -; CHECK: MayAlias: i32* %a, i32* %extract +; CHECK: NoAlias: i32* %a, i32* %extract define i32 @test_extractvalue() { %a = alloca i32 %call = call { ptr, i1 } @get_struct() @@ -13,7 +14,7 @@ define i32 @test_extractvalue() { ret i32 %v } -; CHECK: MayAlias: i32* %a, i32* %extract +; CHECK: NoAlias: i32* %a, i32* %extract define i32 @test_extractelement() { %a = alloca i32 %call = call <2 x ptr> @get_vec() @@ -22,3 +23,25 @@ define i32 @test_extractelement() { %v = load i32, ptr %a ret i32 %v } + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractvalue_escape() { + %a = alloca i32 + call void @escape(ptr %a) + %call = call { ptr, i1 } @get_struct() + %extract = extractvalue { ptr, i1 } %call, 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractelement_escape() { + %a = alloca i32 + call void @escape(ptr %a) + %call = call <2 x ptr> @get_vec() + %extract = extractelement <2 x ptr> %call, i32 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} From a2b4d4e756ff2710e15a6378e337fb5888e5ac36 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Wed, 19 Feb 2025 00:39:33 -0800 Subject: [PATCH 023/220] Revert "[libc] Add strftime_l" (#127766) Reverts llvm/llvm-project#127708 --- libc/config/baremetal/aarch64/entrypoints.txt | 2 -- libc/config/baremetal/arm/entrypoints.txt | 2 -- libc/config/baremetal/riscv/entrypoints.txt | 2 -- libc/config/linux/x86_64/entrypoints.txt | 1 - libc/include/time.yaml | 11 ------- libc/src/time/CMakeLists.txt | 14 -------- libc/src/time/strftime.cpp | 1 + libc/src/time/strftime_l.cpp | 33 ------------------- libc/src/time/strftime_l.h | 25 -------------- 9 files changed, 1 insertion(+), 90 deletions(-) delete mode 100644 libc/src/time/strftime_l.cpp delete mode 100644 libc/src/time/strftime_l.h diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt index 2c226ef176c08..44c4ab49e5c58 100644 --- a/libc/config/baremetal/aarch64/entrypoints.txt +++ b/libc/config/baremetal/aarch64/entrypoints.txt @@ -248,8 +248,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.gmtime libc.src.time.gmtime_r libc.src.time.mktime - libc.src.time.strftime - libc.src.time.strftime_l libc.src.time.timespec_get # internal entrypoints diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 6fd1fce3ab245..370b5462fe9e8 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -248,8 +248,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.gmtime libc.src.time.gmtime_r libc.src.time.mktime - libc.src.time.strftime - libc.src.time.strftime_l libc.src.time.timespec_get # internal entrypoints diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 5985c495bdaf2..07311a60a17a2 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -244,8 +244,6 @@ set(TARGET_LIBC_ENTRYPOINTS 
libc.src.time.gmtime libc.src.time.gmtime_r libc.src.time.mktime - libc.src.time.strftime - libc.src.time.strftime_l libc.src.time.timespec_get # internal entrypoints diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 2e3af00ec303d..a4f6671a59789 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1128,7 +1128,6 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.mktime libc.src.time.nanosleep libc.src.time.strftime - libc.src.time.strftime_l libc.src.time.time libc.src.time.timespec_get diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 7bb25dbe85ac4..37ee824678cda 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -9,7 +9,6 @@ types: - type_name: time_t - type_name: clock_t - type_name: size_t - - type_name: locale_t enums: [] objects: [] functions: @@ -101,16 +100,6 @@ functions: - type: size_t - type: const char *__restrict - type: const struct tm *__restrict - - name: strftime_l - standard: - - stdc - return_type: size_t - arguments: - - type: char *__restrict - - type: size_t - - type: const char *__restrict - - type: const struct tm *__restrict - - type: locale_t - name: time standard: - stdc diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt index 3b951df810011..8332e8ab66f97 100644 --- a/libc/src/time/CMakeLists.txt +++ b/libc/src/time/CMakeLists.txt @@ -150,20 +150,6 @@ add_entrypoint_object( libc.src.time.strftime_core.strftime_main ) -add_entrypoint_object( - strftime_l - SRCS - strftime_l.cpp - HDRS - strftime_l.h - DEPENDS - libc.hdr.types.locale_t - libc.hdr.types.size_t - libc.hdr.types.struct_tm - libc.src.stdio.printf_core.writer - libc.src.time.strftime_core.strftime_main -) - add_entrypoint_object( time SRCS diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp index c19e58fbadf71..4b89bf2ea3a70 100644 --- a/libc/src/time/strftime.cpp +++ b/libc/src/time/strftime.cpp @@ -19,6 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(size_t, strftime, (char *__restrict buffer, size_t buffsz, const char *__restrict format, const tm *timeptr)) { + printf_core::WriteBuffer wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(&wb); int ret = strftime_core::strftime_main(&writer, format, timeptr); diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp deleted file mode 100644 index 4203136af4cba..0000000000000 --- a/libc/src/time/strftime_l.cpp +++ /dev/null @@ -1,33 +0,0 @@ -//===-- Implementation of strftime_l function -----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/time/strftime_l.h" -#include "hdr/types/size_t.h" -#include "hdr/types/struct_tm.h" -#include "include/llvm-libc-types/locale_t.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/stdio/printf_core/writer.h" -#include "src/time/strftime_core/strftime_main.h" - -namespace LIBC_NAMESPACE_DECL { - -// TODO: Add support for locales. -LLVM_LIBC_FUNCTION(size_t, strftime_l, - (char *__restrict buffer, size_t count, - const char *__restrict format, - const struct tm *__restrict tp, locale_t)) { - printf_core::WriteBuffer wb(buffer, (buffsz > 0 ? 
buffsz - 1 : 0)); - printf_core::Writer writer(&wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); - if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. - wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast(ret) > buffsz) ? 0 : ret; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_l.h b/libc/src/time/strftime_l.h deleted file mode 100644 index 158c4a63986b4..0000000000000 --- a/libc/src/time/strftime_l.h +++ /dev/null @@ -1,25 +0,0 @@ -//===-- Implementation header for strftime_l --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_TIME_STRFTIME_L_H -#define LLVM_LIBC_SRC_TIME_STRFTIME_L_H - -#include "hdr/types/size_t.h" -#include "hdr/types/struct_tm.h" -#include "include/llvm-libc-types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -size_t strftime_l(char *__restrict buffer, size_t count, - const char *__restrict format, const struct tm *__restrict tp, - locale_t); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_TIME_STRFTIME_L_H From b9a1e58b2c63c9c0ef775d97d6728b64083a2239 Mon Sep 17 00:00:00 2001 From: Vladislav Belov Date: Wed, 19 Feb 2025 11:39:55 +0300 Subject: [PATCH 024/220] [RISCVISel] Compute leading zeros for RISCVISD::VCPOP_VL node (#127705) This patch adds handling of the RISCVISD::VCPOP_VL node in RISCVTargetLowering::computeKnownBitsForTargetNode. It eliminates redundant zero-extension instructions. 
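For illustration, this is the pattern exercised by the new vcpop-compute-known-bits.ll test added below (a restatement of the motivating case, not additional functionality): the result of a vector popcount can be at most VL, so its high bits are now known to be zero and the trailing zero-extension folds away, leaving a single vcpop.m.

    define i32 @test(<8 x i1> %mask) {
      %1 = bitcast <8 x i1> %mask to i8
      %2 = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %1)
      ; Previously this zext survived lowering; with the known-bits handling
      ; for RISCVISD::VCPOP_VL it is recognized as redundant.
      %3 = zext nneg i8 %2 to i32
      ret i32 %3
    }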
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 + .../RISCV/rvv/vcpop-compute-known-bits.ll | 18 ++ .../CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll | 198 ++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 98c25bc93a8a2..28cc136d76ffc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19462,6 +19462,11 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.intersectWith(Known2); break; } + case RISCVISD::VCPOP_VL: { + KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); + Known.Zero.setBitsFrom(Known2.countMaxActiveBits()); + break; + } case RISCVISD::CZERO_EQZ: case RISCVISD::CZERO_NEZ: Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll new file mode 100644 index 0000000000000..7c569da9291db --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64 + +define i32 @test(<8 x i1> %mask) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret + %1 = bitcast <8 x i1> %mask to i8 + %2 = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %1) + %3 = zext nneg i8 %2 to i32 + ret i32 %3 +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll new file mode 100644 index 0000000000000..16c4ade7fa9cb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64 + +define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef signext %c, i32 noundef signext %n) { +; RV32-LABEL: test_store1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: blez a3, .LBB0_6 +; RV32-NEXT: # %bb.1: # %for.body.preheader +; RV32-NEXT: li a4, 8 +; RV32-NEXT: bltu a3, a4, .LBB0_7 +; RV32-NEXT: # %bb.2: # %for.body.preheader +; RV32-NEXT: sub a4, a0, a1 +; RV32-NEXT: sltu a5, a0, a1 +; RV32-NEXT: neg a5, a5 +; RV32-NEXT: sltiu a4, a4, 32 +; RV32-NEXT: seqz a5, a5 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: bnez a4, .LBB0_7 +; RV32-NEXT: # %bb.3: # %vector.ph +; RV32-NEXT: lui a5, 524288 +; RV32-NEXT: addi a5, a5, -8 +; RV32-NEXT: and a5, a3, a5 +; RV32-NEXT: li a7, 0 +; RV32-NEXT: li a6, 0 +; RV32-NEXT: .LBB0_4: # %vector.body +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: slli t0, a7, 2 +; RV32-NEXT: addi t1, a7, 8 +; RV32-NEXT: add t0, a1, t0 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (t0) +; RV32-NEXT: sltu a7, t1, a7 +; RV32-NEXT: xor t0, t1, a5 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: vmslt.vx v10, v8, a2 +; RV32-NEXT: vcompress.vm v12, v8, v10 +; RV32-NEXT: vcpop.m a7, v10 +; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV32-NEXT: vse32.v v12, (a0) +; RV32-NEXT: slli a7, a7, 2 +; RV32-NEXT: or t0, t0, a6 +; RV32-NEXT: add a0, a0, a7 +; RV32-NEXT: mv a7, t1 +; RV32-NEXT: bnez t0, .LBB0_4 +; RV32-NEXT: # %bb.5: # %middle.block +; RV32-NEXT: bne a5, a3, .LBB0_9 +; RV32-NEXT: .LBB0_6: # %for.cond.cleanup +; RV32-NEXT: ret +; RV32-NEXT: .LBB0_7: +; RV32-NEXT: li a5, 0 +; RV32-NEXT: li a4, 0 +; RV32-NEXT: j .LBB0_9 +; RV32-NEXT: .LBB0_8: # %for.inc +; RV32-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV32-NEXT: addi a5, a5, 1 +; RV32-NEXT: seqz a6, a5 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: xor a6, a5, a3 +; RV32-NEXT: or a6, a6, a4 +; RV32-NEXT: beqz a6, .LBB0_6 +; RV32-NEXT: .LBB0_9: # %for.body +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: slli a6, a5, 2 +; RV32-NEXT: add a6, a1, a6 +; RV32-NEXT: lw a6, 0(a6) +; RV32-NEXT: bge a6, a2, .LBB0_8 +; RV32-NEXT: # %bb.10: # %if.then +; RV32-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV32-NEXT: addi a7, a0, 4 +; RV32-NEXT: sw a6, 0(a0) +; RV32-NEXT: mv a0, a7 +; RV32-NEXT: j .LBB0_8 +; +; RV64-LABEL: test_store1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: blez a3, .LBB0_6 +; RV64-NEXT: # %bb.1: # %for.body.preheader +; RV64-NEXT: li a5, 8 +; RV64-NEXT: li a4, 0 +; RV64-NEXT: bltu a3, a5, .LBB0_7 +; RV64-NEXT: # %bb.2: # %for.body.preheader +; RV64-NEXT: sub a5, a0, a1 +; RV64-NEXT: li a6, 31 +; RV64-NEXT: bgeu a6, a5, .LBB0_7 +; RV64-NEXT: # %bb.3: # %vector.ph +; RV64-NEXT: lui a4, 524288 +; RV64-NEXT: addiw a4, a4, -8 +; RV64-NEXT: and a4, a3, a4 +; RV64-NEXT: slli a5, a4, 2 +; RV64-NEXT: add a5, a5, a1 +; RV64-NEXT: mv a6, a1 +; RV64-NEXT: .LBB0_4: # %vector.body +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; 
RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a6) +; RV64-NEXT: addi a6, a6, 32 +; RV64-NEXT: vmslt.vx v10, v8, a2 +; RV64-NEXT: vcompress.vm v12, v8, v10 +; RV64-NEXT: vcpop.m a7, v10 +; RV64-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV64-NEXT: vse32.v v12, (a0) +; RV64-NEXT: slli a7, a7, 2 +; RV64-NEXT: add a0, a0, a7 +; RV64-NEXT: bne a6, a5, .LBB0_4 +; RV64-NEXT: # %bb.5: # %middle.block +; RV64-NEXT: bne a4, a3, .LBB0_7 +; RV64-NEXT: .LBB0_6: # %for.cond.cleanup +; RV64-NEXT: ret +; RV64-NEXT: .LBB0_7: # %for.body.preheader13 +; RV64-NEXT: slli a4, a4, 2 +; RV64-NEXT: slli a5, a3, 2 +; RV64-NEXT: add a3, a1, a4 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: j .LBB0_9 +; RV64-NEXT: .LBB0_8: # %for.inc +; RV64-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV64-NEXT: addi a3, a3, 4 +; RV64-NEXT: beq a3, a1, .LBB0_6 +; RV64-NEXT: .LBB0_9: # %for.body +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: lw a4, 0(a3) +; RV64-NEXT: bge a4, a2, .LBB0_8 +; RV64-NEXT: # %bb.10: # %if.then +; RV64-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV64-NEXT: addi a5, a0, 4 +; RV64-NEXT: sw a4, 0(a0) +; RV64-NEXT: mv a0, a5 +; RV64-NEXT: j .LBB0_8 +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %dst11 = ptrtoint ptr %dst to i64 + %src12 = ptrtoint ptr %src to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %min.iters.check = icmp ult i32 %n, 8 + %0 = sub i64 %dst11, %src12 + %diff.check = icmp ult i64 %0, 32 + %or.cond = or i1 %min.iters.check, %diff.check + br i1 %or.cond, label %for.body.preheader13, label %vector.ph + +for.body.preheader13: ; preds = %middle.block, %for.body.preheader + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %dst.addr.09.ph = phi ptr [ %dst, %for.body.preheader ], [ %monotonic.add, %middle.block ] + br label %for.body + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 2147483640 + %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %c, i64 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %monotonic.iv = phi ptr [ %dst, %vector.ph ], [ %monotonic.add, %vector.body ] + %1 = getelementptr inbounds i32, ptr %src, i64 %index + %wide.load = load <8 x i32>, ptr %1, align 4 + %2 = icmp slt <8 x i32> %wide.load, %broadcast.splat + tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %wide.load, ptr align 4 %monotonic.iv, <8 x i1> %2) + %3 = bitcast <8 x i1> %2 to i8 + %4 = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %3) + %5 = shl nuw nsw i8 %4, 2 + %6 = zext nneg i8 %5 to i64 + %monotonic.add = getelementptr inbounds i8, ptr %monotonic.iv, i64 %6 + %index.next = add nuw i64 %index, 8 + %7 = icmp eq i64 %index.next, %n.vec + br i1 %7, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13 + +for.cond.cleanup: ; preds = %for.inc, %middle.block, %entry + ret void + +for.body: ; preds = %for.body.preheader13, %for.inc + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ %indvars.iv.ph, %for.body.preheader13 ] + %dst.addr.09 = phi ptr [ %dst.addr.1, %for.inc ], [ %dst.addr.09.ph, %for.body.preheader13 ] + %arrayidx = getelementptr 
inbounds i32, ptr %src, i64 %indvars.iv + %8 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %8, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %incdec.ptr = getelementptr inbounds i8, ptr %dst.addr.09, i64 4 + store i32 %8, ptr %dst.addr.09, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %dst.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %dst.addr.09, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} From 99aea2176ba2327264501befbbd8dc6da4d64f94 Mon Sep 17 00:00:00 2001 From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com> Date: Wed, 19 Feb 2025 14:17:20 +0530 Subject: [PATCH 025/220] [Flang][OpenMP] Skip threadprivate HostAssoc symbols for default privatization (#127754) Issue: Compilation abnormally terminates in parallel default(private) Documentation reference: A threadprivate variable must not appear as the base variable of a list item in any clause except for the copyin and copyprivate clauses Explanation: From the reference, the threadprivate symbols cannot be used in the DSA clauses, which in turn means, the symbol can be skipped for default privatization Fixes #123535 --- flang/lib/Semantics/resolve-directives.cpp | 2 +- .../test/Lower/OpenMP/threadprivate-hlfir.f90 | 54 ++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 91a1b3061e1f9..7a1dfe003e8c2 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2309,7 +2309,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) { } if (Symbol * found{currScope().FindSymbol(name.source)}) { - if (found->test(semantics::Symbol::Flag::OmpThreadprivate)) + if (found->GetUltimate().test(semantics::Symbol::Flag::OmpThreadprivate)) return; } diff --git a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 index 7d02987c5eade..6201459bc42ca 100644 --- a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 @@ -15,8 +15,6 @@ !CHECK: %{{.*}} = fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[TP_VAL]]) fastmath : (!fir.ref, i32) -> i1 !CHECK: omp.terminator -!CHECK: fir.global internal @_QFsubEa : i32 - subroutine sub() integer, save:: a !$omp threadprivate(a) @@ -25,3 +23,55 @@ subroutine sub() !$omp end parallel end subroutine +!CHECK-LABEL: func.func @_QPsub_02() +subroutine sub_02() + integer, save :: a + !$omp threadprivate(a) + !CHECK: %[[ADDR_02:.*]] = fir.address_of(@_QFsub_02Ea) : !fir.ref + !CHECK: %[[DECL_02:.*]]:2 = hlfir.declare %[[ADDR_02]] {{{.*}} uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[TP_02:.*]] = omp.threadprivate %[[DECL_02]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_02:.*]]:2 = hlfir.declare %[[TP_02]] {{{.*}} uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + call sub_03 + !CHECK: fir.call @_QFsub_02Psub_03() fastmath : () -> () + !CHECK: return + +contains + + !CHECK-LABEL: func.func private @_QFsub_02Psub_03() + subroutine sub_03() + !CHECK: %[[ADDR_03:.*]] = fir.address_of(@_QFsub_02Ea) : !fir.ref + !CHECK: %[[DECL_03:.*]]:2 = 
hlfir.declare %[[ADDR_03]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[TP_03:.*]] = omp.threadprivate %[[DECL_03]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_03:.*]]:2 = hlfir.declare %[[TP_03]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !$omp parallel default(private) + !CHECK: omp.parallel + !CHECK: %[[TP_04:.*]] = omp.threadprivate %[[DECL_03]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_04:.*]]:2 = hlfir.declare %[[TP_04]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + print *, a + !CHECK: omp.terminator + !$omp end parallel + end subroutine +end subroutine + +module mod_01 + integer, save :: a + !CHECK: fir.global @_QMmod_01Ea : i32 + !$omp threadprivate(a) +end module + +!CHECK-LABEL: func.func @_QPsub_05() +subroutine sub_05() + use mod_01, only: a + !$omp parallel default(private) + !CHECK: omp.parallel { + !CHECK: %[[TP_05:.*]] = omp.threadprivate %{{.*}} : !fir.ref -> !fir.ref + !CHECK: %{{.*}} = hlfir.declare %[[TP_05]] {uniq_name = "_QMmod_01Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + print *, a + !CHECK: omp.terminator + !$omp end parallel +end subroutine + + +!CHECK: fir.global internal @_QFsubEa : i32 + +!CHECK: fir.global internal @_QFsub_02Ea : i32 From a2f9ae1421a31e987a4d0c7352137d371709f41d Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 09:56:04 +0100 Subject: [PATCH 026/220] [AMDGPU] Replace gfx940 and gfx941 with gfx942 in offload and libclc (#125826) gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 and SWDEV-512633 --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 2 +- libc/docs/gpu/using.rst | 2 +- libclc/CMakeLists.txt | 2 +- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 6 ------ offload/test/lit.cfg | 4 +--- 5 files changed, 4 insertions(+), 12 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 8e4f47d18535d..f1f6eb57c5d6c 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,7 +98,7 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030" + "gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" "gfx1152;gfx1153" diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 1c1f9c9bfb0c6..f17f6287be313 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -44,7 +44,7 @@ this shouldn't be necessary. $> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc - $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc + $> clang hip.hip --offload-arch=gfx942 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc This will automatically link in the needed function definitions if they were required by the user's application. 
Normally using the ``-fgpu-rdc`` option diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 5cefa8a264310..05a2b87a56bc4 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -211,7 +211,7 @@ set( cayman_aliases aruba ) set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11 gfx602 gfx705 gfx805 - gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942 + gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx942 gfx1010 gfx1011 gfx1012 gfx1013 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 92184ba796dbd..e83d38a14f77f 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2854,12 +2854,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Error checkIfAPU() { // TODO: replace with ROCr API once it becomes available. llvm::StringRef StrGfxName(ComputeUnitKind); - IsAPU = llvm::StringSwitch(StrGfxName) - .Case("gfx940", true) - .Default(false); - if (IsAPU) - return Plugin::success(); - bool MayBeAPU = llvm::StringSwitch(StrGfxName) .Case("gfx942", true) .Default(false); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 1e265d2c30904..f017bca85dd4f 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -134,12 +134,10 @@ elif config.libomptarget_current_target.startswith('amdgcn'): # amdgpu_test_arch contains a list of AMD GPUs in the system # only check the first one assuming that we will run the test on it. if not (config.amdgpu_test_arch.startswith("gfx90a") or - config.amdgpu_test_arch.startswith("gfx940") or config.amdgpu_test_arch.startswith("gfx942")): supports_unified_shared_memory = False # check if AMD architecture is an APU: - if (config.amdgpu_test_arch.startswith("gfx940") or - (config.amdgpu_test_arch.startswith("gfx942") and + if ((config.amdgpu_test_arch.startswith("gfx942") and evaluate_bool_env(config.environment['IS_APU']))): supports_apu = True if supports_unified_shared_memory: From 55fb793dc9ea8cb81169e54133d0603bc9c02840 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 09:58:56 +0100 Subject: [PATCH 027/220] [AMDGPU] Add missing gfx architectures to AddFlangOffloadRuntime.cmake (#125827) --- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index f1f6eb57c5d6c..eb0e964559ed5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,10 +98,10 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx942;gfx1010;gfx1030" + "gfx908;gfx90a;gfx90c;gfx942;gfx950;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" - "gfx1152;gfx1153" + "gfx1152;gfx1153;gfx1200;gfx1201" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" From d4cb75ef8b67864678182e7f21227f4365893578 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 19 Feb 2025 10:00:06 +0100 Subject: [PATCH 028/220] [mlir][bufferization] Module bufferization: Delete obsolete code (#127455) Delete 
`equivalenceAnalysis`, which has been incorporated into the `getAliasingValues` API. Also add an additional test case to ensure that equivalence is properly propagated across function boundaries. --- .../Transforms/OneShotModuleBufferize.cpp | 35 ------------------- .../one-shot-module-bufferize-analysis.mlir | 19 ++++++++++ 2 files changed, 19 insertions(+), 35 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp index 71ea0fd9d43cd..77840690e6a26 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -289,35 +289,6 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { SymbolTable::lookupNearestSymbolFrom(callOp, sym)); } -/// Gather equivalence info of CallOps. -/// Note: This only adds new equivalence info if the called function was already -/// analyzed. -// TODO: This does not handle cyclic function call graphs etc. -static void equivalenceAnalysis(func::FuncOp funcOp, - OneShotAnalysisState &state, - FuncAnalysisState &funcState) { - funcOp->walk([&](func::CallOp callOp) { - func::FuncOp calledFunction = getCalledFunction(callOp); - assert(calledFunction && "could not retrieved called func::FuncOp"); - - // No equivalence info available for the called function. - if (!funcState.equivalentFuncArgs.count(calledFunction)) - return WalkResult::skip(); - - for (auto it : funcState.equivalentFuncArgs[calledFunction]) { - int64_t returnIdx = it.first; - int64_t bbargIdx = it.second; - if (!state.isInPlace(callOp->getOpOperand(bbargIdx))) - continue; - Value returnVal = callOp.getResult(returnIdx); - Value argVal = callOp->getOperand(bbargIdx); - state.unionEquivalenceClasses(returnVal, argVal); - } - - return WalkResult::advance(); - }); -} - /// Return "true" if the given function signature has tensor semantics. static bool hasTensorSignature(func::FuncOp funcOp) { return llvm::any_of(funcOp.getFunctionType().getInputs(), @@ -493,9 +464,6 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, // Now analyzing function. funcState.startFunctionAnalysis(funcOp); - // Gather equivalence info for CallOps. - equivalenceAnalysis(funcOp, state, funcState); - // Analyze funcOp. if (failed(analyzeOp(funcOp, state, statistics))) return failure(); @@ -514,9 +482,6 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, if (!state.getOptions().isOpAllowed(funcOp)) continue; - // Gather equivalence info for CallOps. - equivalenceAnalysis(funcOp, state, funcState); - // Analyze funcOp. if (failed(analyzeOp(funcOp, state, statistics))) return failure(); diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir index 2ca7f7109005c..c947407c63e74 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only dump-alias-sets" -split-input-file | FileCheck %s --check-prefix=CHECK-ALIAS // Run fuzzer with different seeds. 
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null @@ -1406,3 +1407,21 @@ func.func @caller(%c: i1, %t0: tensor<5xf32>, %t1: tensor<5xf32>, %t2: tensor<5x return %r : tensor<5xf32> } +// ----- + +// CHECK-ALIAS-LABEL: func @foo +func.func @foo(%arg0: tensor) -> tensor { + // CHECK-ALIAS: return + // CHECK-ALIAS-SAME: __equivalent_func_args__ = [0] + return %arg0 : tensor +} + +// CHECK-ALIAS: func @bar(%[[arg0:.*]]: tensor +func.func @bar(%arg0: tensor) -> tensor { + // CHECK-ALIAS: %[[call:.*]] = call @foo(%[[arg0]]) + // CHECK-ALIAS-SAME: {__inplace_operands_attr__ = ["true"], __opresult_alias_set_attr__ = [{{\[}}"%[[call]]", "%[[arg0]]"]]} + %x = call @foo(%arg0) : (tensor) -> tensor + // CHECK-ALIAS: return + // CHECK-ALIAS-SAME: __equivalent_func_args__ = [0] + return %x : tensor +} From 8900e412aec04b202b9f071d110f96546989beef Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 10:05:45 +0100 Subject: [PATCH 029/220] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR (#125836) gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631 --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 8 +++---- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 22 +++++++++---------- .../ArithToAMDGPU/ArithToAMDGPU.cpp | 2 +- .../AMDGPU/Transforms/EmulateAtomics.cpp | 8 +------ .../AMDGPUToROCDL/8-bit-floats.mlir | 2 +- mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir | 2 +- .../ArithToAMDGPU/8-bit-float-saturation.mlir | 2 +- .../ArithToAMDGPU/8-bit-floats.mlir | 2 +- .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp | 20 +++++++---------- 10 files changed, 30 insertions(+), 40 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 69745addfd748..f795dd89b79a1 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). The negateA, negateB, and negateC flags are only supported for double-precision - operations on gfx940+. + operations on gfx94x. }]; let assemblyFormat = [{ $sourceA `*` $sourceB `+` $destC diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 01059e42974d0..e9dcd112ce54e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k"> def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">; def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">; def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">; -// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a -// NEG bitfield. See IntrinsicsAMDGPU.td for more info. +// Note: in gfx94x, unlike in gfx90a, the f64 xdlops use the "blgp" argument as +// a NEG bitfield. See IntrinsicsAMDGPU.td for more info. def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">; def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">; -// New in gfx940. +// New in gfx94x. 
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">; def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">; def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">; @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; -// 2:4 Sparsity ops (GFX940) +// 2:4 Sparsity ops (GFX94x) def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">; def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">; def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">; diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index c62314e504dcc..36fbdbed4ae2f 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -80,7 +80,7 @@ namespace { // Define commonly used chipsets versions for convenience. constexpr Chipset kGfx908 = Chipset(9, 0, 8); constexpr Chipset kGfx90a = Chipset(9, 0, 0xa); -constexpr Chipset kGfx940 = Chipset(9, 4, 0); +constexpr Chipset kGfx942 = Chipset(9, 4, 2); /// Define lowering patterns for raw buffer ops template @@ -483,7 +483,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, destElem = destType.getElementType(); if (sourceElem.isF32() && destElem.isF32()) { - if (mfma.getReducePrecision() && chipset >= kGfx940) { + if (mfma.getReducePrecision() && chipset >= kGfx942) { if (m == 32 && n == 32 && k == 4 && b == 1) return ROCDL::mfma_f32_32x32x4_xf32::getOperationName(); if (m == 16 && n == 16 && k == 8 && b == 1) @@ -551,9 +551,9 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, return ROCDL::mfma_i32_32x32x8i8::getOperationName(); if (m == 16 && n == 16 && k == 16 && b == 1) return ROCDL::mfma_i32_16x16x16i8::getOperationName(); - if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940) + if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942) return ROCDL::mfma_i32_32x32x16_i8::getOperationName(); - if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940) + if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942) return ROCDL::mfma_i32_16x16x32_i8::getOperationName(); } @@ -565,7 +565,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, } if (isa(sourceElem) && destElem.isF32() && - chipset >= kGfx940) { + chipset >= kGfx942) { // Known to be correct because there are no scalar f8 instructions and // because a length mismatch will have been caught by the verifier. 
Type sourceBElem = @@ -585,7 +585,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, } if (isa(sourceElem) && destElem.isF32() && - chipset >= kGfx940) { + chipset >= kGfx942) { Type sourceBElem = cast(mfma.getSourceB().getType()).getElementType(); if (m == 16 && n == 16 && k == 32 && b == 1) { @@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern { return op->emitOpError("MFMA only supported on gfx908+"); uint32_t getBlgpField = static_cast(op.getBlgp()); if (op.getNegateA() || op.getNegateB() || op.getNegateC()) { - if (chipset < kGfx940) - return op.emitOpError("negation unsupported on older than gfx940"); + if (chipset < kGfx942) + return op.emitOpError("negation unsupported on older than gfx942"); getBlgpField |= op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2); } @@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite( ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite( PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite( PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index 60a002c41bfb2..b22d852f7c543 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -384,7 +384,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() { } bool convertFP8Arithmetic = - maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0); + maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2); arith::populateArithToAMDGPUConversionPatterns( patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz, *maybeChipset); diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp index 77f972e0e5894..7459a6503cddf 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } // gfx9 has no to a very limited support for floating-point min and max. 
if (chipset.majorVersion == 9) { - if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) { + if (chipset >= Chipset(9, 0, 0xa)) { // gfx90a supports f64 max (and min, but we don't have a min wrapper right // now) but all other types need to be emulated. target.addDynamicallyLegalOp( @@ -189,12 +189,6 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } else { target.addIllegalOp(); } - if (chipset == Chipset(9, 4, 1)) { - // gfx941 requires non-CAS atomics to be implemented with CAS loops. - // The workaround here mirrors HIP and OpenMP. - target.addIllegalOp(); - } } patterns.add< RawBufferAtomicByCasPattern, diff --git a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir index 7818a525d17b5..a313aaffdf5cc 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s // CHECK-LABEL: func @ext_scalar // CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8 diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir index f8a60d37801eb..52db1421dc3c6 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>, %arg2 : vector<16xf32>, %arg3 : vector<4xf32>, %arg4 : vector<4xf16>, %arg5 : vector<4xi8>, diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir index cd921da2294e1..07a428566d488 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt --split-input-file %s \ -// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \ +// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \ // RUN: | FileCheck %s // CHECK-LABEL: func.func @scalar_trunc diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir index 985fb532ea74a..6bb5b9771c015 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s +// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s // CHECK-LABEL: func.func @scalar_ext // CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ) diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp index 976ff2e7382ed..570d56f3c6ff1 100644 --- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp +++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp @@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) { EXPECT_EQ(chipset->minorVersion, 0u); EXPECT_EQ(chipset->steppingVersion, 0xau); - chipset = Chipset::parse("gfx940"); + chipset = Chipset::parse("gfx942"); ASSERT_TRUE(succeeded(chipset)); 
EXPECT_EQ(chipset->majorVersion, 9u); EXPECT_EQ(chipset->minorVersion, 4u); - EXPECT_EQ(chipset->steppingVersion, 0u); + EXPECT_EQ(chipset->steppingVersion, 2u); chipset = Chipset::parse("gfx1103"); ASSERT_TRUE(succeeded(chipset)); @@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) { EXPECT_TRUE(failed(Chipset::parse("navi33"))); EXPECT_TRUE(failed(Chipset::parse("rdna2"))); EXPECT_TRUE(failed(Chipset::parse("sm_80"))); - EXPECT_TRUE(failed(Chipset::parse("GFX940"))); - EXPECT_TRUE(failed(Chipset::parse("Gfx940"))); + EXPECT_TRUE(failed(Chipset::parse("GFX942"))); + EXPECT_TRUE(failed(Chipset::parse("Gfx942"))); EXPECT_TRUE(failed(Chipset::parse("gfx9"))); - EXPECT_TRUE(failed(Chipset::parse("gfx_940"))); - EXPECT_TRUE(failed(Chipset::parse("gfx940_"))); + EXPECT_TRUE(failed(Chipset::parse("gfx_942"))); + EXPECT_TRUE(failed(Chipset::parse("gfx942_"))); EXPECT_TRUE(failed(Chipset::parse("gfxmeow"))); EXPECT_TRUE(failed(Chipset::parse("gfx1fff"))); } TEST(ChipsetTest, Comparison) { - EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0)); - EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2)); + EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2)); EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0)); EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0)); EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2)); - EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1)); EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2)); - EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0)); EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8)); EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa)); - EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2)); - EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0)); + EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2)); } } // namespace From f3f4afe43f4e0f2a07bfb53ba70507e663c9996c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 19 Feb 2025 09:07:49 +0000 Subject: [PATCH 030/220] [X86] matchUnaryShuffle - add support for matching 512-bit extension patterns. (#127643) Handles both BWI and non-BWI cases (skips PMOV*XBW without BWI). The vector-interleaved-store-i8-stride-8.ll VPTERNLOG diffs are due to better value tracking now recognizing the zero-extension patterns where before it was any-extension --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +- .../vector-interleaved-store-i8-stride-8.ll | 32 +- .../CodeGen/X86/vector-replicaton-i1-mask.ll | 360 +++++++----------- 3 files changed, 170 insertions(+), 232 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 696bb14292dd0..8fce4f29035e2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38927,13 +38927,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef Mask, } // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction. - // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). - if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || - (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { + if (AllowIntDomain && + ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || + (MaskVT.is256BitVector() && Subtarget.hasInt256()) || + (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) { unsigned MaxScale = 64 / MaskEltSize; bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize && DAG.ComputeNumSignBits(V1) == MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { + // Skip 512-bit VPMOV?XBW on non-AVX512BW targets. 
+ if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs()) + continue; bool MatchAny = true; bool MatchZero = true; bool MatchSign = UseSign; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index ba51c65ccab13..251139161e46f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -6905,7 +6905,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload @@ -6927,7 +6927,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload @@ -6944,7 +6944,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -6968,7 +6968,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 @@ -7035,7 +7035,7 @@ define void 
@store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11) ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero @@ -7057,7 +7057,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 @@ -7070,7 +7070,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 @@ -7083,7 +7083,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 @@ -7589,7 +7589,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) +; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload @@ -7611,7 +7611,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload @@ -7628,7 +7628,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -7652,7 +7652,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 @@ -7719,7 +7719,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11) ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm6 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero @@ -7741,7 +7741,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 @@ -7754,7 +7754,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 @@ -7767,7 +7767,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index a8df418143f32..717d1e447e165 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -98,8 +98,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -110,8 +109,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; 
AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -122,8 +120,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -145,8 +142,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} @@ -162,8 +158,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} @@ -176,8 +171,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 @@ -200,21 +194,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} 
{z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) @@ -227,21 +220,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) @@ -249,47 +241,25 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32: -; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] -; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-ONLY-NEXT: vzeroupper -; AVX512BW-ONLY-NEXT: retq -; -; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32: -; 
AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vzeroupper -; AVX512VBMI-ONLY-NEXT: retq +; AVX512BW-LABEL: mask_replication_factor2_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> @@ -301,42 +271,41 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5 -; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k3 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k2 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 +; 
AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm4 +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k4} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k5} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) @@ -351,41 +320,40 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-LABEL: mask_replication_factor2_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: kmovw 2(%rdi), %k5 -; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovd2m 
%zmm3, %k2 +; AVX512DQ-NEXT: vpmovm2d %k3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm3, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 +; AVX512DQ-NEXT: vpmovsxdq %ymm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) @@ -402,12 +370,9 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7] -; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -441,8 +406,7 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m 
%zmm1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2525,8 +2489,7 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 @@ -2598,47 +2561,25 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: -; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] -; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-ONLY-NEXT: vzeroupper -; AVX512BW-ONLY-NEXT: retq -; -; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: -; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vzeroupper -; AVX512VBMI-ONLY-NEXT: retq +; AVX512BW-LABEL: mask_replication_factor4_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; 
AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> @@ -2747,11 +2688,9 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2785,8 +2724,7 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2997,8 +2935,7 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -3060,8 +2997,7 @@ define void @mask_replication_factor4_vf64(ptr 
%in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -12956,8 +12892,7 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY: # %bb.0: ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} @@ -13083,10 +13018,10 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -13291,13 +13226,12 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k3 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 ; AVX512BW-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -13680,16 +13614,16 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -13710,9 +13644,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm10, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm11, %k1 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $48, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 @@ -13735,8 +13669,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm12, %k2 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm10, %k2 +; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 @@ -13765,7 +13699,7 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx) -; AVX512BW-NEXT: 
vmovdqa64 %zmm12, 1280(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1280(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) @@ -13775,9 +13709,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 512(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) From 029c8e783d17d55541b308ee6db5429d54cb5153 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 10:11:48 +0100 Subject: [PATCH 031/220] [AMDGPU][clang] Replace gfx940 and gfx941 with gfx942 in clang (#126762) gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all occurrences of gfx940/gfx941 from clang that can be removed without changes in the llvm directory. The target-invalid-cpu-note/amdgcn.c test is not included here since it tests a list of targets that is defined in llvm/lib/TargetParser/TargetParser.cpp. For SWDEV-512631 --- clang/include/clang/Basic/Cuda.h | 2 - clang/lib/Basic/Cuda.cpp | 2 - clang/lib/Basic/Targets/NVPTX.cpp | 2 - clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 - clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu | 2 +- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 - .../test/CodeGenOpenCL/builtins-amdgcn-fp8.cl | 2 +- ...cn-gfx940.cl => builtins-amdgcn-gfx942.cl} | 2 +- .../builtins-amdgcn-gfx950-err.cl | 2 +- .../builtins-amdgcn-gws-insts.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-mfma.cl | 110 +++++++++--------- ...fx940.cl => builtins-fp-atomics-gfx942.cl} | 34 +++--- clang/test/Driver/amdgpu-macros.cl | 2 - clang/test/Driver/amdgpu-mcpu.cl | 4 - clang/test/Driver/cuda-bad-arch.cu | 2 +- clang/test/Driver/hip-macros.hip | 10 +- .../test/Misc/target-invalid-cpu-note/nvptx.c | 2 - ... 
=> builtins-amdgcn-error-gfx942-param.cl} | 2 +- .../builtins-amdgcn-error-gfx950.cl | 2 +- ...0-err.cl => builtins-amdgcn-gfx942-err.cl} | 14 +-- 20 files changed, 91 insertions(+), 113 deletions(-) rename clang/test/CodeGenOpenCL/{builtins-amdgcn-gfx940.cl => builtins-amdgcn-gfx942.cl} (98%) rename clang/test/CodeGenOpenCL/{builtins-fp-atomics-gfx940.cl => builtins-fp-atomics-gfx942.cl} (84%) rename clang/test/SemaOpenCL/{builtins-amdgcn-error-gfx940-param.cl => builtins-amdgcn-error-gfx942-param.cl} (99%) rename clang/test/SemaOpenCL/{builtins-amdgcn-gfx940-err.cl => builtins-amdgcn-gfx942-err.cl} (81%) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index f33ba46233a7a..793cab1f4e84a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -106,8 +106,6 @@ enum class OffloadArch { GFX90a, GFX90c, GFX9_4_GENERIC, - GFX940, - GFX941, GFX942, GFX950, GFX10_1_GENERIC, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1bfec0b37c5ee..f45fb0eca3714 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -124,8 +124,6 @@ static const OffloadArchToStringMap arch_names[] = { GFX(90a), // gfx90a GFX(90c), // gfx90c {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"}, - GFX(940), // gfx940 - GFX(941), // gfx941 GFX(942), // gfx942 GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 7d13c1f145440..547cf3dfa2be7 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -211,8 +211,6 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index c13928f61a748..826ec4da8ea28 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2302,8 +2302,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 47fa3967fe237..37fca614c3111 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -11,7 +11,7 @@ // RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fcuda-is-device -target-cpu gfx942 -fnative-half-type \ // RUN: -fnative-half-arguments-and-returns -munsafe-fp-atomics \ // RUN: | FileCheck -check-prefix=UNSAFE %s diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 633f1dec5e370..d12dcead6fadf 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -29,8 +29,6 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s // 
RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s -// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s -// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s @@ -85,8 +83,6 @@ // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" -// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX950: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index 6593a8de566f6..f300b05fe798a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl similarity index 98% rename from clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl rename to clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl index a2f14c652c828..789f6e07240d7 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // REQUIRES: amdgpu-registered-target typedef unsigned int u32; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl index 521121f5e7e54..c91cf158948b9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl @@ -2,7 +2,7 @@ // RUN: -verify -o - %s // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -emit-llvm \ // RUN: -verify -o - %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 -emit-llvm \ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 -emit-llvm \ // RUN: -verify -o - %s // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm \ // RUN: -verify -o - %s diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl index 45d2fa18efd53..b3367202f824e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -emit-llvm -o - %s | FileCheck %s // RUN: 
%clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90c -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl index 00346baa6ff84..79083c3c5f0f9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -1,7 +1,7 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -DMFMA_GFX942_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX942 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950 #pragma OPENCL EXTENSION cl_khr_fp64:enable @@ -226,189 +226,189 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c) #endif // MFMA_GFX90A_TESTS -#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) -// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8 -// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) +#if defined(MFMA_GFX942_TESTS) || defined(MFMA_GFX950_TESTS) +// CHECK-GFX942-LABEL: @test_mfma_i32_16x16x32_i8 +// CHECK-GFX942: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c) { *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8 -// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_i32_32x32x16_i8 +// CHECK-GFX942: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c) { *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x8_xf32 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32 -// CHECK-GFX940: call <16 x 
float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x4_xf32 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_bf8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_bf8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_fp8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_fp8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_bf8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_bf8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0); } -// 
CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_fp8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_fp8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x32_f16 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x16_f16 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x32_bf16 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x16_bf16 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8 -// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_i32_16x16x64_i8 +// CHECK-GFX942: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x 
i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0) void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx) { *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8 -// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_i32_32x32x32_i8 +// CHECK-GFX942: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0) void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx) { *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = 
__builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0); } -#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) +#endif // defined(MFMA_GFX942_TESTS) || defined(MFMA_GFX950_TESTS) #ifdef MFMA_GFX950_TESTS diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl similarity index 84% rename from clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl rename to clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl index 832d7df00db14..24d05fe3a8525 100644 --- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ // RUN: %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ -// RUN: -S -o - %s | FileCheck -check-prefix=GFX940 %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ +// RUN: -S -o - %s | FileCheck -check-prefix=GFX942 %s // REQUIRES: amdgpu-registered-target @@ -12,8 +12,8 @@ typedef short __attribute__((ext_vector_type(2))) short2; // CHECK-LABEL: test_flat_add_f32 // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} -// GFX940-LABEL: test_flat_add_f32 -// GFX940: flat_atomic_add_f32 +// GFX942-LABEL: test_flat_add_f32 +// GFX942: flat_atomic_add_f32 half2 test_flat_add_f32(__generic float *addr, float x) { return __builtin_amdgcn_flat_atomic_fadd_f32(addr, x); } @@ -21,8 +21,8 @@ half2 test_flat_add_f32(__generic float *addr, 
float x) { // CHECK-LABEL: test_flat_add_2f16 // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} -// GFX940-LABEL: test_flat_add_2f16 -// GFX940: flat_atomic_pk_add_f16 +// GFX942-LABEL: test_flat_add_2f16 +// GFX942: flat_atomic_pk_add_f16 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x); } @@ -32,8 +32,8 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_flat_add_2bf16 -// GFX940: flat_atomic_pk_add_bf16 +// GFX942-LABEL: test_flat_add_2bf16 +// GFX942: flat_atomic_pk_add_bf16 short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x); } @@ -43,8 +43,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_global_add_2bf16 -// GFX940: global_atomic_pk_add_bf16 +// GFX942-LABEL: test_global_add_2bf16 +// GFX942: global_atomic_pk_add_bf16 short2 test_global_add_2bf16(__global short2 *addr, short2 x) { return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); } @@ -55,24 +55,24 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4{{$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_local_add_2bf16 -// GFX940: ds_pk_add_rtn_bf16 +// GFX942-LABEL: test_local_add_2bf16 +// GFX942: ds_pk_add_rtn_bf16 short2 test_local_add_2bf16(__local short2 *addr, short2 x) { return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); } // CHECK-LABEL: test_local_add_2f16 // CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4 -// GFX940-LABEL: test_local_add_2f16 -// GFX940: ds_pk_add_rtn_f16 +// GFX942-LABEL: test_local_add_2f16 +// GFX942: ds_pk_add_rtn_f16 half2 test_local_add_2f16(__local half2 *addr, half2 x) { return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); } // CHECK-LABEL: test_local_add_2f16_noret // CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4 -// GFX940-LABEL: test_local_add_2f16_noret -// GFX940: ds_pk_add_f16 +// GFX942-LABEL: test_local_add_2f16_noret +// GFX942: ds_pk_add_f16 void test_local_add_2f16_noret(__local half2 *addr, half2 x) { __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); } diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index d97b2ddb1fc66..35dc190761ca4 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -107,8 +107,6 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx909 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90a -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF 
%s -DWAVEFRONT_SIZE=64 -DCPU=gfx90c -DFAMILY=GFX9 -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx940 -DFAMILY=GFX9 -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx941 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx942 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx950 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 -DFAMILY=GFX10 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl index 7c34d3ec6c63a..ad5fd8ebaa6a6 100644 --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -92,8 +92,6 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefix=GFX909 %s // RUN: %clang -### -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A %s // RUN: %clang -### -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefix=GFX90C %s -// RUN: %clang -### -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940 %s -// RUN: %clang -### -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefix=GFX941 %s // RUN: %clang -### -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefix=GFX942 %s // RUN: %clang -### -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefix=GFX950 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s @@ -148,8 +146,6 @@ // GFX909: "-target-cpu" "gfx909" // GFX90A: "-target-cpu" "gfx90a" // GFX90C: "-target-cpu" "gfx90c" -// GFX940: "-target-cpu" "gfx940" -// GFX941: "-target-cpu" "gfx941" // GFX942: "-target-cpu" "gfx942" // GFX950: "-target-cpu" "gfx950" // GFX1010: "-target-cpu" "gfx1010" diff --git a/clang/test/Driver/cuda-bad-arch.cu b/clang/test/Driver/cuda-bad-arch.cu index 8c8c5c3401329..85231a5b9705a 100644 --- a/clang/test/Driver/cuda-bad-arch.cu +++ b/clang/test/Driver/cuda-bad-arch.cu @@ -23,7 +23,7 @@ // RUN: | FileCheck -check-prefix OK %s // RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx90a -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s -// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx940 -c %s 2>&1 \ +// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx942 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s // We don't allow using NVPTX/AMDGCN for host compilation. 
diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip index 3b3afba0b18ca..bd93f9985a774 100644 --- a/clang/test/Driver/hip-macros.hip +++ b/clang/test/Driver/hip-macros.hip @@ -49,15 +49,13 @@ // RUN: %s 2>&1 | FileCheck --check-prefixes=IMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx1100 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=IMAGE,NOWARN %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s -// RUN: %clang -E -dM --offload-arch=gfx941 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx1100 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -Xclang -target-feature -Xclang "-image-insts" %s 2>&1 | FileCheck --check-prefixes=IMAGE,WARN %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -Xclang -target-feature -Xclang "+image-insts" %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,WARN %s // NOWARN-NOT: warning // WARN: warning: feature flag '{{[+|-]}}image-insts' is ignored since the feature is read only [-Winvalid-command-line-argument] @@ -68,9 +66,9 @@ // RUN: %clang -E -dM --offload-arch=gfx1100 -nogpuinc -nogpulib \ // RUN: -fgpu-default-stream=per-thread %s 2>&1 | FileCheck --check-prefixes=PTS %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -fgpu-default-stream=legacy %s 2>&1 | FileCheck --check-prefixes=NOPTS %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOPTS %s // PTS-DAG: #define __HIP_API_PER_THREAD_DEFAULT_STREAM__ 1 // PTS-DAG: #define __HIP_API_PER_THREAD_DEFAULT_STREAM__ 1 diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c index 3afcdf8c9fe5c..06ef72878340f 100644 --- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c +++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c @@ -52,8 +52,6 @@ // CHECK-SAME: {{^}}, gfx90a // CHECK-SAME: {{^}}, gfx90c // CHECK-SAME: {{^}}, gfx9-4-generic -// CHECK-SAME: {{^}}, gfx940 -// CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 // CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx10-1-generic diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl similarity index 99% rename from clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl rename to clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl index 0fc2304d51ce0..db0387e9878f2 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify -S -o - %s // RUN: %clang_cc1 -triple 
spirv64-amd-amdhsa -verify -S -o - %s typedef float v2f __attribute__((ext_vector_type(2))); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl index d1c134c604dfc..b40b1c841b453 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify -S -o - %s typedef float float4 __attribute__((ext_vector_type(4))); typedef float float16 __attribute__((ext_vector_type(16))); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl similarity index 81% rename from clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl rename to clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl index 7cf80f7c92677..0b3f692f33998 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify=gfx940,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx942,expected -o - %s // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s // REQUIRES: amdgpu-registered-target @@ -8,12 +8,12 @@ void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + 
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx942-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx942-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} } __attribute__((target("gfx950-insts"))) From 4af8c5382e12a74a3c79c573e029d681719a323b Mon Sep 17 00:00:00 2001 From: Pedro Lobo Date: Wed, 19 Feb 2025 09:13:37 +0000 Subject: [PATCH 032/220] [BitcodeReader] Use `poison` instead of `undef` to represent unsupported constexprs in metadata (#127665) Metadata that references unsupported constant expressions can be represented with `poison` metadata instead of `undef` metadata. --- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 4 ++-- llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll | 4 ++-- llvm/test/Bitcode/constexpr-to-instr-metadata.ll | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 413d9f68e6cc3..f8f5432c73a0f 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1229,13 +1229,13 @@ static Value *getValueFwdRef(BitcodeReaderValueList &ValueList, unsigned Idx, // This is a reference to a no longer supported constant expression. // Pretend that the constant was deleted, which will replace metadata - // references with undef. + // references with poison. // TODO: This is a rather indirect check. It would be more elegant to use // a separate ErrorInfo for constant materialization failure and thread // the error reporting through getValueFwdRef(). 
if (Idx < ValueList.size() && ValueList[Idx] && ValueList[Idx]->getType() == Ty) - return UndefValue::get(Ty); + return PoisonValue::get(Ty); return nullptr; } diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll index 72f092adf5054..77b78fb4bd4f8 100644 --- a/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll +++ b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll @@ -1,7 +1,7 @@ ; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata-2.bc | FileCheck %s ; CHECK-LABEL: define void @_ZN4alsa3pcm3PCM17hw_params_current17hf1c237aece2f69c4E() { -; CHECK: #dbg_value(ptr undef, !4, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !14 +; CHECK: #dbg_value(ptr poison, !4, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !14 ; CHECK-LABEL: define void @_ZN4alsa3pcm8HwParams3any17h02a64cfc85ce8a66E() { -; CHECK: #dbg_value(ptr undef, !23, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !28 +; CHECK: #dbg_value(ptr poison, !23, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !28 diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll index ecc39a86c6327..84b1a8f5ba45d 100644 --- a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll +++ b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll @@ -1,4 +1,4 @@ ; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata.bc | FileCheck %s ; CHECK-LABEL: define void @test() { -; CHECK: #dbg_value(i64 undef, !4, !DIExpression(DW_OP_LLVM_fragment, 64, 64), !13 +; CHECK: #dbg_value(i64 poison, !4, !DIExpression(DW_OP_LLVM_fragment, 64, 64), !13 From cc539138acf742f53ef455147182a8de11980a02 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 19 Feb 2025 10:16:57 +0100 Subject: [PATCH 033/220] [CodeGen] Use __extendhfsf2 and __truncsfhf2 by default (#126880) The standard libcalls for half to float and float to half conversion are __extendhfsf2 and __truncsfhf2. However, LLVM currently uses __gnu_h2f_ieee and __gnu_f2h_ieee instead. As far as I can tell, these libcalls are an ARM-ism and only provided by libgcc on that platform. compiler-rt always provides both libcalls. Use the standard libcalls by default, and only use the __gnu libcalls on ARM. 
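[Editorial note, not part of the patch] A minimal sketch of the kind of source that exercises these libcalls may help; the function name half_scale and the use of _Float16 here are illustrative assumptions, not anything taken from this series. On a target without native fp16 arithmetic (for example a soft-float AArch64 or RISC-V configuration), the widening and narrowing conversions below are lowered through the FPEXT_F16_F32 and FPROUND_F32_F16 runtime-libcall slots, which after this change name __extendhfsf2 and __truncsfhf2 by default; non-MachO ARM keeps the __gnu_* names via the ARMISelLowering.cpp hunk further down.

/* Illustrative only -- not part of the patch.  Compile for a target
 * without native fp16 arithmetic and the two conversions become
 * runtime-library calls rather than hardware instructions.            */
_Float16 half_scale(_Float16 x) {
  float widened = (float)x;    /* FPEXT_F16_F32  -> __extendhfsf2 (previously __gnu_h2f_ieee on most non-Darwin targets) */
  widened = widened * 2.0f;    /* plain float arithmetic, no libcall needed */
  return (_Float16)widened;    /* FPROUND_F32_F16 -> __truncsfhf2 (previously __gnu_f2h_ieee) */
}

The updated AArch64 and LoongArch test expectations in this patch show exactly this lowering ("bl __extendhfsf2" / "bl __truncsfhf2" in place of the old __gnu_* calls), while the ARM-only fallback reflects that libgcc ships the __gnu_* entry points only on that platform.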
--- .../wasm/lto/Inputs/libcall-return-addr.ll | 6 + .../wasm/lto/Inputs/libcall-truncsfhf2.ll | 6 - lld/test/wasm/lto/libcall-return-addr.ll | 18 + lld/test/wasm/lto/libcall-truncsfhf2.ll | 20 - llvm/include/llvm/IR/RuntimeLibcalls.def | 4 +- llvm/lib/IR/RuntimeLibcalls.cpp | 3 - llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 + .../Target/Hexagon/HexagonISelLowering.cpp | 5 - llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 - .../WebAssembly/WebAssemblyISelLowering.cpp | 5 - .../WebAssemblyRuntimeLibcallSignatures.cpp | 4 - llvm/lib/Target/X86/X86ISelLowering.cpp | 3 - .../16bit-float-promotion-with-nofp.ll | 2 +- llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 24 +- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 24 +- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 24 +- llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 24 +- .../AArch64/strictfp_f16_abi_promote.ll | 40 +- llvm/test/CodeGen/LoongArch/fp16-promote.ll | 92 +-- llvm/test/CodeGen/Mips/fp16-promote.ll | 96 +-- llvm/test/CodeGen/Mips/ldexp.ll | 4 +- llvm/test/CodeGen/PowerPC/atomics.ll | 8 +- .../PowerPC/handle-f16-storage-type.ll | 168 ++-- llvm/test/CodeGen/PowerPC/pr48519.ll | 14 +- llvm/test/CodeGen/PowerPC/pr49092.ll | 2 +- llvm/test/CodeGen/PowerPC/vector-llrint.ll | 756 +++++++++--------- llvm/test/CodeGen/PowerPC/vector-lrint.ll | 756 +++++++++--------- llvm/test/CodeGen/SPARC/fp16-promote.ll | 76 +- .../VE/Scalar/fp_extload_truncstore.ll | 28 +- llvm/test/CodeGen/X86/cvt16.ll | 10 +- llvm/test/CodeGen/X86/fmf-flags.ll | 6 +- llvm/test/CodeGen/X86/fp-i129.ll | 4 +- llvm/test/CodeGen/X86/fp128-cast-strict.ll | 4 +- llvm/test/CodeGen/X86/fptosi-sat-scalar.ll | 20 +- llvm/test/CodeGen/X86/fptoui-sat-scalar.ll | 20 +- llvm/test/CodeGen/X86/frem.ll | 2 +- llvm/test/CodeGen/X86/half-constrained.ll | 14 +- llvm/test/CodeGen/X86/ldexp.ll | 4 +- llvm/test/CodeGen/X86/llvm.frexp.ll | 10 +- .../GlobalISel/LegalizerHelperTest.cpp | 4 +- 40 files changed, 1147 insertions(+), 1169 deletions(-) create mode 100644 lld/test/wasm/lto/Inputs/libcall-return-addr.ll delete mode 100644 lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll create mode 100644 lld/test/wasm/lto/libcall-return-addr.ll delete mode 100644 lld/test/wasm/lto/libcall-truncsfhf2.ll diff --git a/lld/test/wasm/lto/Inputs/libcall-return-addr.ll b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll new file mode 100644 index 0000000000000..271bdae11e49d --- /dev/null +++ b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll @@ -0,0 +1,6 @@ +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-emscripten" + +define ptr @emscripten_return_address() { + ret ptr null +} diff --git a/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll b/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll deleted file mode 100644 index 1439d7f8b4cb4..0000000000000 --- a/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll +++ /dev/null @@ -1,6 +0,0 @@ -target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" -target triple = "wasm32-unknown-unknown" - -define half @__truncsfhf2(float) { - ret half 0.0 -} diff --git a/lld/test/wasm/lto/libcall-return-addr.ll b/lld/test/wasm/lto/libcall-return-addr.ll new file mode 100644 index 0000000000000..74eba74f97018 --- /dev/null +++ b/lld/test/wasm/lto/libcall-return-addr.ll @@ -0,0 +1,18 @@ +; RUN: llvm-as %s -o %t.o +; RUN: llvm-as %p/Inputs/libcall-return-addr.ll -o %t.return-addr.o +; RUN: rm -f %t.a +; RUN: llvm-ar rcs %t.a %t.return-addr.o +; RUN: not wasm-ld --export-all %t.o %t.a -o 
%t.wasm 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-emscripten" + +@g_ptr = global ptr null + +define void @_start() { + %addr = call ptr @llvm.returnaddress(i32 1) + store ptr %addr, ptr @g_ptr + ret void +} + +; CHECK: wasm-ld: error: {{.*}}return-addr.o): attempt to add bitcode file after LTO (emscripten_return_address) diff --git a/lld/test/wasm/lto/libcall-truncsfhf2.ll b/lld/test/wasm/lto/libcall-truncsfhf2.ll deleted file mode 100644 index fd07bb53890f6..0000000000000 --- a/lld/test/wasm/lto/libcall-truncsfhf2.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llvm-as %s -o %t.o -; RUN: llvm-as %p/Inputs/libcall-truncsfhf2.ll -o %t.truncsfhf2.o -; RUN: rm -f %t.a -; RUN: llvm-ar rcs %t.a %t.truncsfhf2.o -; RUN: not wasm-ld --export-all %t.o %t.a -o %t.wasm 2>&1 | FileCheck %s - -target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" -target triple = "wasm32-unknown-unknown" - -@g_float = global float 0.0 -@g_half = global half 0.0 - -define void @_start() { - %val1 = load float, ptr @g_float - %v0 = fptrunc float %val1 to half - store half %v0, ptr @g_half - ret void -} - -; CHECK: wasm-ld: error: {{.*}}truncsfhf2.o): attempt to add bitcode file after LTO (__truncsfhf2) diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index a7963543c4350..c6ac341d71a20 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -384,8 +384,8 @@ HANDLE_LIBCALL(FPEXT_F16_F128, "__extendhftf2") HANDLE_LIBCALL(FPEXT_F16_F80, "__extendhfxf2") HANDLE_LIBCALL(FPEXT_F32_F64, "__extendsfdf2") HANDLE_LIBCALL(FPEXT_F16_F64, "__extendhfdf2") -HANDLE_LIBCALL(FPEXT_F16_F32, "__gnu_h2f_ieee") -HANDLE_LIBCALL(FPROUND_F32_F16, "__gnu_f2h_ieee") +HANDLE_LIBCALL(FPEXT_F16_F32, "__extendhfsf2") +HANDLE_LIBCALL(FPROUND_F32_F16, "__truncsfhf2") HANDLE_LIBCALL(FPROUND_F64_F16, "__truncdfhf2") HANDLE_LIBCALL(FPROUND_F80_F16, "__truncxfhf2") HANDLE_LIBCALL(FPROUND_F128_F16, "__trunctfhf2") diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index e38fce764b640..1f94400f7c088 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -170,9 +170,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { // TODO: BridgeOS should be included in isOSDarwin. 
setLibcallName(RTLIB::EXP10_F32, "__exp10f"); setLibcallName(RTLIB::EXP10_F64, "__exp10"); - } else { - setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); } if (TT.isGNUEnvironment() || TT.isOSFuchsia() || diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index eb1491feb611e..c7ed73d0e95f7 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -767,6 +767,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } + } else if (!Subtarget->isTargetMachO()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); } if (Subtarget->isThumb1Only()) diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index d66e3e306d2ff..1710488e4e292 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1886,11 +1886,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf"); else setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf"); - - // Routines to handle fp16 storage type. - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - setLibcallName(RTLIB::FPROUND_F64_F16, "__truncdfhf2"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); } const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 28cc136d76ffc..0f5e7bd254f68 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1549,9 +1549,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.useRVVForFixedLengthVectors()) setTargetDAGCombine(ISD::BITCAST); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - // Disable strict node mutation. IsStrictFPEnabled = true; EnableExtLdPromotion = true; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4fc79b3d6e3f8..da4ef677440fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -377,11 +377,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setMaxAtomicSizeInBitsSupported(64); - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is - // consistent with the f64 and f128 names. - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - // Define the emscripten name for return address helper. // TODO: when implementing other Wasm backends, make this generic or only do // this on emscripten depending on what they end up doing. 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index b20a06b238c88..1fe0b1f2e0591 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -537,10 +537,6 @@ struct StaticLibcallNameMap { Map[NameLibcall.first] = NameLibcall.second; } } - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is - // consistent with the f64 and f128 names. - Map["__extendhfsf2"] = RTLIB::FPEXT_F16_F32; - Map["__truncsfhf2"] = RTLIB::FPROUND_F32_F16; Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8fce4f29035e2..dc25af9e2f1d5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -736,9 +736,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll index bfe9ab8424bb0..0bd7c1b10b123 100644 --- a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll +++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll @@ -7,7 +7,7 @@ define half @f2h(float %a) { ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 0c3a40d93d640..21729b9dfd101 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -60,13 +60,13 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -148,13 +148,13 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -712,22 +712,22 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 24088998f36d1..9b5e48d2b4217 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -62,13 +62,13 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; 
SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -150,13 +150,13 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -592,22 +592,22 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 65f1f4863c173..f6c542fe7d407 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -62,13 +62,13 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -150,13 +150,13 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, 
half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -592,22 +592,22 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 0f1a2f03c98c3..82e0f14e68e26 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -60,13 +60,13 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -148,13 +148,13 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -712,22 
+712,22 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll index 3db802a2bc355..63b8a1cee27ae 100644 --- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -22,7 +22,7 @@ define void @f16_arg(half %arg, ptr %ptr) #0 { ; NOFP16-NEXT: .cfi_offset w30, -16 ; NOFP16-NEXT: and w0, w0, #0xffff ; NOFP16-NEXT: mov x19, x1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: str w0, [x19] ; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; NOFP16-NEXT: ret @@ -44,10 +44,10 @@ define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: and w0, w0, #0xffff ; NOFP16-NEXT: mov x19, x2 ; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: stp w21, w0, [x19] ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload @@ -73,14 +73,14 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: and w0, w1, #0xffff ; NOFP16-NEXT: mov x19, x3 ; NOFP16-NEXT: mov w20, w2 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w8, w0 ; NOFP16-NEXT: and w0, w20, #0xffff ; NOFP16-NEXT: orr x21, x8, x22, lsl #32 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: str x21, [x19] ; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: str w0, [x19, #8] @@ -110,16 +110,16 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w20, w3 ; NOFP16-NEXT: mov w21, w2 ; NOFP16-NEXT: mov w22, w1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w23, w0 ; NOFP16-NEXT: and w0, w22, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: 
bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: stp w21, w0, [x19, #8] ; NOFP16-NEXT: stp w23, w22, [x19] ; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload @@ -137,7 +137,7 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; NOFP16-NEXT: .cfi_def_cfa_offset 16 ; NOFP16-NEXT: .cfi_offset w30, -16 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; NOFP16-NEXT: ret %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -155,10 +155,10 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: .cfi_offset w30, -32 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w20 ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload @@ -180,13 +180,13 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w2 ; NOFP16-NEXT: mov w19, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w19 ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: mov w2, w21 @@ -212,16 +212,16 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w0, w3 ; NOFP16-NEXT: mov w19, w2 ; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w21 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w20 ; NOFP16-NEXT: mov w2, w19 ; NOFP16-NEXT: mov w3, w22 diff --git a/llvm/test/CodeGen/LoongArch/fp16-promote.ll b/llvm/test/CodeGen/LoongArch/fp16-promote.ll index c5a27a7011278..3701f0df1d2b2 100644 --- a/llvm/test/CodeGen/LoongArch/fp16-promote.ll +++ b/llvm/test/CodeGen/LoongArch/fp16-promote.ll @@ -23,12 +23,12 @@ define float @test_fpextend_float(ptr %p) nounwind { ; LA32-LABEL: test_fpextend_float: ; LA32: # %bb.0: ; LA32-NEXT: ld.hu $a0, $a0, 0 -; LA32-NEXT: b %plt(__gnu_h2f_ieee) +; LA32-NEXT: b %plt(__extendhfsf2) ; ; LA64-LABEL: test_fpextend_float: ; LA64: # %bb.0: ; LA64-NEXT: ld.hu $a0, $a0, 0 -; LA64-NEXT: b %plt(__gnu_h2f_ieee) +; LA64-NEXT: b %plt(__extendhfsf2) %a = load half, ptr %p %r = fpext half %a to float ret float %r @@ -40,7 +40,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: ld.hu $a0, $a0, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fcvt.d.s $fa0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 @@ -51,7 +51,7 @@ define double 
@test_fpextend_double(ptr %p) nounwind { ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: ld.hu $a0, $a0, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fcvt.d.s $fa0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 @@ -68,7 +68,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill ; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -81,7 +81,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload @@ -132,12 +132,12 @@ define half @test_fadd_reg(half %a, half %b) nounwind { ; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: move $a0, $a1 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $fp -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -152,12 +152,12 @@ define half @test_fadd_reg(half %a, half %b) nounwind { ; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: move $a0, $a1 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $fp -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload @@ -178,12 +178,12 @@ define void @test_fadd_mem(ptr %p, ptr %q) nounwind { ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: ld.hu $s0, $a0, 0 ; LA32-NEXT: ld.hu $a0, $a1, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $s0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA32-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload @@ -202,12 +202,12 @@ define void @test_fadd_mem(ptr %p, ptr %q) nounwind { ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: ld.hu $s0, $a0, 0 ; LA64-NEXT: ld.hu $a0, $a1, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $s0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, 
$fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload @@ -231,12 +231,12 @@ define half @test_fmul_reg(half %a, half %b) nounwind { ; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: move $a0, $a1 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $fp -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -251,12 +251,12 @@ define half @test_fmul_reg(half %a, half %b) nounwind { ; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: move $a0, $a1 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $fp -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload @@ -277,12 +277,12 @@ define void @test_fmul_mem(ptr %p, ptr %q) nounwind { ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: ld.hu $s0, $a0, 0 ; LA32-NEXT: ld.hu $a0, $a1, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $s0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA32-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload @@ -301,12 +301,12 @@ define void @test_fmul_mem(ptr %p, ptr %q) nounwind { ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: ld.hu $s0, $a0, 0 ; LA64-NEXT: ld.hu $a0, $a1, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $s0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload @@ -327,10 +327,10 @@ define half @freeze_half_undef() nounwind { ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: movgr2fr.w $fa0, $zero -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -340,10 +340,10 @@ define half @freeze_half_undef() nounwind { ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: movgr2fr.w $fa0, $zero -; 
LA64-NEXT: bl %plt(__gnu_f2h_ieee) -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 ; LA64-NEXT: ret @@ -357,9 +357,9 @@ define half @freeze_half_poison(half %maybe.poison) nounwind { ; LA32: # %bb.0: ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -368,9 +368,9 @@ define half @freeze_half_poison(half %maybe.poison) nounwind { ; LA64: # %bb.0: ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 ; LA64-NEXT: ret @@ -384,7 +384,7 @@ define signext i32 @test_half_to_s32(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: ftintrz.w.s $fa0, $fa0 ; LA32-NEXT: movfr2gr.s $a0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -395,7 +395,7 @@ define signext i32 @test_half_to_s32(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.w.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.s $a0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload @@ -411,7 +411,7 @@ define zeroext i32 @test_half_to_s32_u32(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: ftintrz.w.s $fa0, $fa0 ; LA32-NEXT: movfr2gr.s $a0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -422,7 +422,7 @@ define zeroext i32 @test_half_to_s32_u32(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.w.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.s $a0, $fa0 ; LA64-NEXT: bstrpick.d $a0, $a0, 31, 0 @@ -439,7 +439,7 @@ define i64 @test_half_to_i64(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: bl %plt(__fixsfdi) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 @@ -449,7 +449,7 @@ define i64 @test_half_to_i64(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.l.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.d 
$a0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/fp16-promote.ll b/llvm/test/CodeGen/Mips/fp16-promote.ll index 47bace9f5c03f..c03ca3a6d78dd 100644 --- a/llvm/test/CodeGen/Mips/fp16-promote.ll +++ b/llvm/test/CodeGen/Mips/fp16-promote.ll @@ -11,12 +11,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $16, $4 ; MIPS32-NEXT: lhu $4, 0($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f0, $f20 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload @@ -33,12 +33,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; MIPS64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $4 ; MIPS64-NEXT: lhu $4, 0($5) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f0, $f24 ; MIPS64-NEXT: sh $2, 0($16) ; MIPS64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload @@ -59,7 +59,7 @@ define float @test_fpext_float(ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: lhu $4, 0($4) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: jr $ra @@ -70,7 +70,7 @@ define float @test_fpext_float(ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: lhu $4, 0($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload ; MIPS64-NEXT: jr $ra @@ -86,7 +86,7 @@ define double @test_fpext_double(ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: lhu $4, 0($4) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -98,7 +98,7 @@ define double @test_fpext_double(ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: lhu $4, 0($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload @@ -115,7 +115,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload @@ -128,7 +128,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal 
__truncsfhf2 ; MIPS64-NEXT: move $16, $5 ; MIPS64-NEXT: sh $2, 0($16) ; MIPS64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload @@ -180,18 +180,18 @@ define <4 x float> @test_vec_fpext_float(ptr %p) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $17, $4 ; MIPS32-NEXT: lhu $4, 6($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: lhu $4, 4($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: swc1 $f0, 12($17) ; MIPS32-NEXT: swc1 $f0, 8($17) ; MIPS32-NEXT: lhu $4, 2($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: swc1 $f0, 4($17) ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: swc1 $f0, 0($17) ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload @@ -209,21 +209,21 @@ define <4 x float> @test_vec_fpext_float(ptr %p) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $4 ; MIPS64-NEXT: lhu $4, 2($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: lhu $4, 6($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mfc1 $17, $f0 ; MIPS64-NEXT: mfc1 $18, $f0 ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: dsll $17, $17, 32 ; MIPS64-NEXT: mfc1 $1, $f0 ; MIPS64-NEXT: dsll $1, $1, 32 ; MIPS64-NEXT: dsrl $1, $1, 32 ; MIPS64-NEXT: or $17, $1, $17 ; MIPS64-NEXT: lhu $4, 4($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: dsll $18, $18, 32 ; MIPS64-NEXT: mfc1 $1, $f0 ; MIPS64-NEXT: dsll $1, $1, 32 @@ -251,21 +251,21 @@ define <4 x double> @test_vec_fpext_double(ptr %p) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $17, $4 ; MIPS32-NEXT: lhu $4, 6($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: lhu $4, 4($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: mov.s $f20, $f0 ; MIPS32-NEXT: lhu $4, 2($16) ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: cvt.d.s $f2, $f20 ; MIPS32-NEXT: sdc1 $f2, 24($17) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: sdc1 $f0, 16($17) ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: sdc1 $f0, 8($17) ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: sdc1 $f0, 0($17) @@ -285,21 +285,21 @@ define <4 x double> @test_vec_fpext_double(ptr %p) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $17, $4 ; MIPS64-NEXT: lhu $4, 6($5) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: move $16, $5 ; MIPS64-NEXT: lhu $4, 4($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mov.s $f24, $f0 ; MIPS64-NEXT: lhu $4, 2($16) ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: cvt.d.s $f1, $f24 ; MIPS64-NEXT: sdc1 $f1, 24($17) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sdc1 $f0, 16($17) ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: sdc1 $f0, 8($17) ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: 
cvt.d.s $f0, $f0 ; MIPS64-NEXT: sdc1 $f0, 0($17) @@ -326,18 +326,18 @@ define void @test_vec_fptrunc_float(<4 x float> %a, ptr %p) nounwind { ; MIPS32-NEXT: move $16, $7 ; MIPS32-NEXT: move $17, $5 ; MIPS32-NEXT: move $18, $4 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $6, $f12 ; MIPS32-NEXT: move $19, $2 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $16, $f12 ; MIPS32-NEXT: mtc1 $17, $f12 ; MIPS32-NEXT: lw $16, 56($sp) ; MIPS32-NEXT: sh $2, 6($16) -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: sh $19, 4($16) ; MIPS32-NEXT: sh $2, 2($16) -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $18, $f12 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload @@ -360,22 +360,22 @@ define void @test_vec_fptrunc_float(<4 x float> %a, ptr %p) nounwind { ; MIPS64-NEXT: move $17, $5 ; MIPS64-NEXT: move $18, $4 ; MIPS64-NEXT: sll $1, $18, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: move $19, $2 ; MIPS64-NEXT: sll $1, $17, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: dsrl $1, $17, 32 ; MIPS64-NEXT: sll $1, $1, 0 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: sh $2, 4($16) -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: sh $19, 0($16) ; MIPS64-NEXT: sh $2, 6($16) ; MIPS64-NEXT: dsrl $1, $18, 32 ; MIPS64-NEXT: sll $1, $1, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: sh $2, 2($16) ; MIPS64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload @@ -484,19 +484,19 @@ define half @test_fadd_fadd(half %a, half %b, half %c) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $16, $6 ; MIPS32-NEXT: move $17, $4 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $5 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $17 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f0, $f20 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $2 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $16 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f20, $f0 ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $17, 24($sp) # 4-byte Folded Reload @@ -514,19 +514,19 @@ define half @test_fadd_fadd(half %a, half %b, half %c) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $6 ; MIPS64-NEXT: move $17, $4 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $5, 0 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $17, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f0, $f24 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $2, 0 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $16, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; 
MIPS64-NEXT: add.s $f12, $f24, $f0 ; MIPS64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload ; MIPS64-NEXT: ld $17, 8($sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/ldexp.ll b/llvm/test/CodeGen/Mips/ldexp.ll index 3753fd567a3ed..4debc6ddce4aa 100644 --- a/llvm/test/CodeGen/Mips/ldexp.ll +++ b/llvm/test/CodeGen/Mips/ldexp.ll @@ -128,12 +128,12 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; SOFT-NEXT: .cfi_offset 31, -4 ; SOFT-NEXT: .cfi_offset 16, -8 ; SOFT-NEXT: move $16, $5 -; SOFT-NEXT: jal __gnu_h2f_ieee +; SOFT-NEXT: jal __extendhfsf2 ; SOFT-NEXT: andi $4, $4, 65535 ; SOFT-NEXT: move $4, $2 ; SOFT-NEXT: jal ldexpf ; SOFT-NEXT: move $5, $16 -; SOFT-NEXT: jal __gnu_f2h_ieee +; SOFT-NEXT: jal __truncsfhf2 ; SOFT-NEXT: move $4, $2 ; SOFT-NEXT: lw $16, 16($sp) # 4-byte Folded Reload ; SOFT-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index 07bdbb25a746a..24e71c87414e8 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -476,7 +476,7 @@ define half @load_atomic_f16__seq_cst(ptr %ptr) { ; PPC32-NEXT: cmpw cr7, r3, r3 ; PPC32-NEXT: bne- cr7, .+4 ; PPC32-NEXT: isync -; PPC32-NEXT: bl __gnu_h2f_ieee +; PPC32-NEXT: bl __extendhfsf2 ; PPC32-NEXT: lwz r0, 20(r1) ; PPC32-NEXT: addi r1, r1, 16 ; PPC32-NEXT: mtlr r0 @@ -494,7 +494,7 @@ define half @load_atomic_f16__seq_cst(ptr %ptr) { ; PPC64-NEXT: cmpd cr7, r3, r3 ; PPC64-NEXT: bne- cr7, .+4 ; PPC64-NEXT: isync -; PPC64-NEXT: bl __gnu_h2f_ieee +; PPC64-NEXT: bl __extendhfsf2 ; PPC64-NEXT: nop ; PPC64-NEXT: addi r1, r1, 112 ; PPC64-NEXT: ld r0, 16(r1) @@ -582,7 +582,7 @@ define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; PPC32-NEXT: .cfi_offset r30, -8 ; PPC32-NEXT: stw r30, 8(r1) # 4-byte Folded Spill ; PPC32-NEXT: mr r30, r3 -; PPC32-NEXT: bl __gnu_f2h_ieee +; PPC32-NEXT: bl __truncsfhf2 ; PPC32-NEXT: sync ; PPC32-NEXT: sth r3, 0(r30) ; PPC32-NEXT: lwz r30, 8(r1) # 4-byte Folded Reload @@ -601,7 +601,7 @@ define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; PPC64-NEXT: .cfi_offset r30, -16 ; PPC64-NEXT: std r30, 112(r1) # 8-byte Folded Spill ; PPC64-NEXT: mr r30, r3 -; PPC64-NEXT: bl __gnu_f2h_ieee +; PPC64-NEXT: bl __truncsfhf2 ; PPC64-NEXT: nop ; PPC64-NEXT: sync ; PPC64-NEXT: sth r3, 0(r30) diff --git a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll index 4256933300243..50f05cca80458 100644 --- a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll +++ b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll @@ -17,7 +17,7 @@ define dso_local double @loadd(ptr nocapture readonly %a) local_unnamed_addr #0 ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 2(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -37,7 +37,7 @@ define dso_local double @loadd(ptr nocapture readonly %a) local_unnamed_addr #0 ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 2(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -61,7 +61,7 @@ define dso_local float @loadf(ptr nocapture readonly %a) local_unnamed_addr #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 2(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ 
-81,7 +81,7 @@ define dso_local float @loadf(ptr nocapture readonly %a) local_unnamed_addr #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 2(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) @@ -130,9 +130,9 @@ define dso_local void @stored(ptr nocapture %a, double %b) local_unnamed_addr #0 ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -156,7 +156,7 @@ define dso_local void @storef(ptr nocapture %a, float %b) local_unnamed_addr #0 ; P8-NEXT: stdu r1, -48(r1) ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r3 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -179,12 +179,12 @@ define dso_local void @storef(ptr nocapture %a, float %b) local_unnamed_addr #0 ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: clrldi r3, r4, 32 ; SOFT-NEXT: std r0, 64(r1) -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -220,9 +220,9 @@ define void @test_load_store(ptr %in, ptr %out) #0 { ; SOFT-NEXT: std r0, 64(r1) ; SOFT-NEXT: mr r30, r4 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -279,7 +279,7 @@ define float @test_extend32(ptr %addr) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -298,7 +298,7 @@ define float @test_extend32(ptr %addr) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) @@ -315,7 +315,7 @@ define double @test_extend64(ptr %addr) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -334,7 +334,7 @@ define double @test_extend64(ptr %addr) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -354,7 +354,7 @@ define void @test_trunc32(float %in, ptr %addr) #0 { ; P8-NEXT: stdu r1, -48(r1) ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -377,12 +377,12 @@ define void @test_trunc32(float %in, ptr %addr) #0 { ; SOFT-NEXT: clrldi r3, r3, 32 ; SOFT-NEXT: std r0, 64(r1) ; SOFT-NEXT: mr r30, r4 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl 
__gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -427,9 +427,9 @@ define void @test_trunc64(double %in, ptr %addr) #0 { ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -448,7 +448,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xscvdpsxds f0, f1 ; P8-NEXT: mffprd r3, f0 @@ -472,7 +472,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __fixsfdi ; SOFT-NEXT: nop @@ -494,7 +494,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 { ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 ; P8-NEXT: xscvsxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -522,12 +522,12 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 { ; SOFT-NEXT: bl __floatdisf ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -546,7 +546,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xscvdpuxds f0, f1 ; P8-NEXT: mffprd r3, f0 @@ -570,7 +570,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __fixunssfdi ; SOFT-NEXT: nop @@ -592,7 +592,7 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 { ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 ; P8-NEXT: xscvuxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -619,12 +619,12 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 { ; SOFT-NEXT: mr r30, r4 ; SOFT-NEXT: bl __floatundisf ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -651,19 +651,19 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 { ; P8-NEXT: stxvd2x vs62, r1, r4 # 16-byte Folded Spill ; P8-NEXT: li r4, 80 ; P8-NEXT: stxvd2x vs63, r1, r4 # 16-byte Folded Spill -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 2(r30) ; P8-NEXT: xxlor vs63, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz 
r3, 4(r30) ; P8-NEXT: xxlor vs62, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 0(r30) ; P8-NEXT: xxlor vs61, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: li r3, 80 ; P8-NEXT: xxmrghd vs0, vs61, vs1 @@ -714,19 +714,19 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 { ; SOFT-NEXT: std r0, 96(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: lhz r3, 2(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: lhz r3, 4(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: lhz r3, 6(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r6, r3 ; SOFT-NEXT: mr r3, r29 @@ -759,19 +759,19 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 { ; P8-NEXT: stxvd2x vs62, r1, r4 # 16-byte Folded Spill ; P8-NEXT: li r4, 80 ; P8-NEXT: stxvd2x vs63, r1, r4 # 16-byte Folded Spill -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 4(r30) ; P8-NEXT: xxlor vs63, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 2(r30) ; P8-NEXT: xxlor vs62, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 0(r30) ; P8-NEXT: xxlor vs61, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: li r3, 80 ; P8-NEXT: xxmrghd vs35, vs63, vs62 @@ -816,25 +816,25 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 { ; SOFT-NEXT: std r0, 96(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: lhz r3, 2(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: lhz r3, 4(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: lhz r3, 6(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -870,21 +870,21 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; P8-NEXT: stxvd2x vs63, r1, r3 # 16-byte Folded Spill ; P8-NEXT: mr r30, r5 ; P8-NEXT: vmr v31, v2 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xxswapd vs0, vs63 ; P8-NEXT: mr r29, r3 ; P8-NEXT: xscvspdpn f1, vs0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xxsldwi vs0, vs63, vs63, 1 ; P8-NEXT: mr r28, r3 ; P8-NEXT: xscvspdpn f1, vs0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xscvspdpn f1, vs63 ; P8-NEXT: mr r27, r3 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 6(r30) ; P8-NEXT: li r3, 48 @@ -939,48 +939,48 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; SOFT-NEXT: mr r30, r7 ; SOFT-NEXT: mr r29, r5 ; SOFT-NEXT: mr r28, r4 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r26, r3 ; SOFT-NEXT: clrldi r3, r29, 32 -; 
SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r28, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r27, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: clrldi r3, r28, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r29, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r26, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 6(r30) ; SOFT-NEXT: mr r3, r29 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 4(r30) ; SOFT-NEXT: mr r3, r28 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 2(r30) ; SOFT-NEXT: mr r3, r27 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 80 @@ -1093,33 +1093,33 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 { ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: clrldi r3, r28, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r29, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r26, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 6(r30) ; SOFT-NEXT: mr r3, r29 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 4(r30) ; SOFT-NEXT: mr r3, r28 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 2(r30) ; SOFT-NEXT: mr r3, r27 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 80 @@ -1145,15 +1145,15 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; P8-NEXT: std r0, 80(r1) ; P8-NEXT: mr r30, r3 ; P8-NEXT: lhz r3, 0(r4) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: mtfprwa f0, r30 ; P8-NEXT: fmr f31, f1 ; P8-NEXT: xscvsxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: clrldi r3, r3, 48 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xsaddsp f1, f31, f1 ; P8-NEXT: addi r1, r1, 64 @@ -1187,17 +1187,17 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; SOFT-NEXT: std r0, 80(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r4) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: extsw r3, r30 ; SOFT-NEXT: bl __floatsisf ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: 
nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r4, r3 ; SOFT-NEXT: mr r3, r29 @@ -1221,10 +1221,10 @@ define half @PR40273(half) #0 { ; P8-NEXT: mflr r0 ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: clrldi r3, r3, 48 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: fmr f0, f1 ; P8-NEXT: xxlxor f1, f1, f1 @@ -1260,7 +1260,7 @@ define half @PR40273(half) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: clrldi r3, r3, 48 ; SOFT-NEXT: std r0, 48(r1) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: li r4, 0 ; SOFT-NEXT: bl __nesf2 @@ -1268,7 +1268,7 @@ define half @PR40273(half) #0 { ; SOFT-NEXT: cmplwi r3, 0 ; SOFT-NEXT: lis r3, 16256 ; SOFT-NEXT: iseleq r3, 0, r3 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/pr48519.ll b/llvm/test/CodeGen/PowerPC/pr48519.ll index 002dd8f0d167a..fa156454a1313 100644 --- a/llvm/test/CodeGen/PowerPC/pr48519.ll +++ b/llvm/test/CodeGen/PowerPC/pr48519.ll @@ -20,17 +20,17 @@ define void @julia__typed_vcat_20() #0 { ; CHECK-NEXT: addi r3, r3, -1 ; CHECK-NEXT: mtfprd f0, r3 ; CHECK-NEXT: xscvsxdsp f1, f0 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r30, r30, -1 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: cmpldi r30, 0 ; CHECK-NEXT: bc 12, gt, .LBB0_1 ; CHECK-NEXT: # %bb.2: # %bb11 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: sth r3, 0(r3) ; @@ -95,7 +95,7 @@ define void @julia__hypot_17() #0 { ; CHECK-NEXT: # %bb.2: # %bb3 ; CHECK-NEXT: # ; CHECK-NEXT: lhz r3, 0(0) -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fcmpu cr0, f1, f1 ; CHECK-NEXT: bun cr0, .LBB1_1 @@ -169,12 +169,12 @@ define void @func_48786() #0 { ; CHECK-NEXT: # %bb.3: # %bb4 ; CHECK-NEXT: # ; CHECK-NEXT: lhz r3, 0(r3) -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bc 4, 4*cr2+lt, .LBB2_6 ; CHECK-NEXT: # %bb.4: # %bb8 ; CHECK-NEXT: # -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: sth r3, 0(0) ; CHECK-NEXT: b .LBB2_1 @@ -273,7 +273,7 @@ define void @func_48785(half %arg) #0 { ; CHECK-NEXT: .LBB3_1: # %bb1 ; CHECK-NEXT: # ; CHECK-NEXT: fmr f1, f31 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r30, r30, -1 ; CHECK-NEXT: sth r3, 0(r29) diff --git a/llvm/test/CodeGen/PowerPC/pr49092.ll b/llvm/test/CodeGen/PowerPC/pr49092.ll index ea84c77603d08..7b524a6d2f69b 100644 --- a/llvm/test/CodeGen/PowerPC/pr49092.ll +++ b/llvm/test/CodeGen/PowerPC/pr49092.ll @@ -14,7 +14,7 @@ define dso_local half @test2(i64 %a, i64 %b) local_unnamed_addr #0 { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: addi r3, r3, 11 ; CHECK-NEXT: clrlwi r3, r3, 16 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r1, r1, 32 ; CHECK-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/vector-llrint.ll b/llvm/test/CodeGen/PowerPC/vector-llrint.ll index 190cf6fe1eaad..9229fefced67e 100644 --- 
a/llvm/test/CodeGen/PowerPC/vector-llrint.ll +++ b/llvm/test/CodeGen/PowerPC/vector-llrint.ll @@ -17,10 +17,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; BE-NEXT: std r0, 128(r1) ; BE-NEXT: .cfi_def_cfa_offset 112 ; BE-NEXT: .cfi_offset lr, 16 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -36,10 +36,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -55,10 +55,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; FAST-NEXT: std r0, 48(r1) ; FAST-NEXT: .cfi_def_cfa_offset 32 ; FAST-NEXT: .cfi_offset lr, 16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: mffprd r3, f0 @@ -85,18 +85,18 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; BE-NEXT: fmr f31, f1 ; BE-NEXT: fmr f1, f2 ; BE-NEXT: std r30, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -129,18 +129,18 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; CHECK-NEXT: stfd f31, 88(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f2 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -172,17 +172,17 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f2 ; FAST-NEXT: std r0, 64(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: fctid f1, f30 @@ -226,34 +226,34 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; BE-NEXT: stfd f31, 200(r1) # 8-byte Folded Spill ; BE-NEXT: fmr f31, f4 ; BE-NEXT: fmr f30, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 
; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -313,34 +313,34 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; CHECK-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f4 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -394,31 +394,31 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; FAST-NEXT: std r0, 80(r1) ; FAST-NEXT: fmr f31, f3 ; FAST-NEXT: fmr f30, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f30 ; FAST-NEXT: fctid f2, f31 @@ -491,66 +491,66 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; BE-NEXT: fmr f28, f5 ; BE-NEXT: fmr f27, f4 ; BE-NEXT: fmr f26, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl 
__truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -664,66 +664,66 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; CHECK-NEXT: stfd f31, 232(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f8 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, 
r27, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -821,59 +821,59 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; FAST-NEXT: fmr f27, f4 ; FAST-NEXT: fmr f26, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f26 @@ -1001,130 +1001,130 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; BE-NEXT: fmr f23, f5 ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; 
BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 652(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 668(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 660(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: fmr f24, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f23, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f22, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f21, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f20, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f19, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -1343,130 +1343,130 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r3 # 16-byte Folded Spill ; CHECK-NEXT: li r3, 160 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; 
CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 568(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 576(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 584(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r16, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r17, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; 
CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -1650,115 +1650,115 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; FAST-NEXT: fmr f22, f4 ; FAST-NEXT: fmr f23, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 304(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 296(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: fmr f1, f21 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f19 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: fmr f1, f18 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: fmr f1, f17 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: fmr f1, f20 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: fmr f1, f22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: fmr f1, f23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f23 @@ -1935,272 +1935,272 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: std r3, 304(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: std r3, 296(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: std r3, 280(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: std r3, 264(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: std r3, 248(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: std r3, 232(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: std r3, 216(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: std r3, 200(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: std r3, 184(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: std r3, 168(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: std r3, 152(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1028(r1) ; BE-NEXT: std r3, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1044(r1) ; BE-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1036(r1) ; BE-NEXT: mr r15, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1060(r1) ; BE-NEXT: mr 
r14, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1052(r1) ; BE-NEXT: mr r31, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1076(r1) ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1068(r1) ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1092(r1) ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1084(r1) ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1108(r1) ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1100(r1) ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1124(r1) ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1116(r1) ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1140(r1) ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1132(r1) ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1156(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1148(r1) ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1172(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1164(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: stfs f1, 316(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: stfs f1, 312(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: stfs f1, 292(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: stfs f1, 276(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: stfs f1, 260(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: stfs f1, 244(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: stfs f1, 228(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: stfs f1, 212(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: stfs f1, 196(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: stfs f1, 180(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 
; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: stfs f1, 164(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: stfs f1, 148(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: stfs f1, 132(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r31, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r14, 48 ; BE-NEXT: fmr f16, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r15, 48 ; BE-NEXT: fmr f15, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f14, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f31, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 136(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f30, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 152(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f29, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 168(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f28, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 184(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f27, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 200(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f26, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 216(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f25, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 232(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f24, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 248(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f23, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 264(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f22, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 280(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f21, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 296(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f20, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 304(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f19, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -2561,274 +2561,274 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) 
{ ; CHECK-NEXT: stvx v30, r1, r4 # 16-byte Folded Spill ; CHECK-NEXT: li r4, 384 ; CHECK-NEXT: stvx v31, r1, r4 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: std r3, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: std r3, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: std r3, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: std r3, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: std r3, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: std r3, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: std r3, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: std r3, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: std r3, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: std r3, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 832(r1) ; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 840(r1) ; CHECK-NEXT: std r3, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 848(r1) ; CHECK-NEXT: mr r15, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 856(r1) ; CHECK-NEXT: mr r14, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 864(r1) ; CHECK-NEXT: mr r31, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 872(r1) ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 880(r1) ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 888(r1) ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 896(r1) ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 904(r1) ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop 
; CHECK-NEXT: lfs f1, 912(r1) ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 920(r1) ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 928(r1) ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 936(r1) ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 944(r1) ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 952(r1) ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 960(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 968(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 976(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 204 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r16, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 200 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r17, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r31, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl 
__extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r14, 48 ; CHECK-NEXT: fmr f16, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r15, 48 ; CHECK-NEXT: fmr f15, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 48(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f14, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 56(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f30, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 64(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v30, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 72(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v29, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 80(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v28, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 88(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v27, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 96(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v26, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 104(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v25, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v24, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v23, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 128(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v22, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 144(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v21, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 160(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v20, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 176(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f31, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -3200,238 +3200,238 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; FAST-NEXT: xxlor v31, f6, f6 ; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill ; FAST-NEXT: lfs f1, 768(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 120 ; FAST-NEXT: stxsdx 
f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 760(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 112 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 752(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 104 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 744(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 96 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 736(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 88 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 728(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 80 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 720(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 72 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 712(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 64 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 704(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 56 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 696(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 48 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 688(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v21, f1, f1 ; FAST-NEXT: lfs f1, 680(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v20, f1, f1 ; FAST-NEXT: lfs f1, 672(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v24, f1, f1 ; FAST-NEXT: lfs f1, 664(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 656(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 648(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: lfs f1, 640(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: lfs f1, 632(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: lfs f1, 624(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: xxlor f1, v25, v25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: xxlor f1, v26, v26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: xxlor f1, v27, v27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: xxlor f1, v28, v28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: xxlor f1, v29, v29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: xxlor f1, v30, v30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: xxlor f1, v31, v31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f14 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr 
f14, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: xxlor f1, v22, v22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: xxlor f1, v23, v23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 44 ; FAST-NEXT: fmr f15, f1 ; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f3, f15 ; FAST-NEXT: fctid f4, f17 diff --git a/llvm/test/CodeGen/PowerPC/vector-lrint.ll b/llvm/test/CodeGen/PowerPC/vector-lrint.ll index b6d0bd5c05894..c2576d4631db8 100644 --- a/llvm/test/CodeGen/PowerPC/vector-lrint.ll +++ b/llvm/test/CodeGen/PowerPC/vector-lrint.ll @@ -28,10 +28,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; BE-NEXT: std r0, 128(r1) ; BE-NEXT: .cfi_def_cfa_offset 112 ; BE-NEXT: .cfi_offset lr, 16 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -47,10 +47,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -66,10 +66,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; FAST-NEXT: std r0, 48(r1) ; FAST-NEXT: .cfi_def_cfa_offset 32 ; FAST-NEXT: .cfi_offset lr, 16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: mffprd r3, f0 @@ -96,18 +96,18 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; BE-NEXT: fmr f31, f1 ; BE-NEXT: fmr f1, f2 ; BE-NEXT: std r30, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -140,18 +140,18 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; CHECK-NEXT: stfd f31, 88(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f2 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; 
CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -183,17 +183,17 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f2 ; FAST-NEXT: std r0, 64(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: fctid f1, f30 @@ -237,34 +237,34 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; BE-NEXT: stfd f31, 200(r1) # 8-byte Folded Spill ; BE-NEXT: fmr f31, f4 ; BE-NEXT: fmr f30, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -324,34 +324,34 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; CHECK-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f4 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -405,31 +405,31 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; FAST-NEXT: std r0, 80(r1) ; FAST-NEXT: fmr f31, f3 ; FAST-NEXT: fmr f30, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl 
__gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f30 ; FAST-NEXT: fctid f2, f31 @@ -502,66 +502,66 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; BE-NEXT: fmr f28, f5 ; BE-NEXT: fmr f27, f4 ; BE-NEXT: fmr f26, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -675,66 +675,66 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; CHECK-NEXT: stfd f31, 232(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f8 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl 
__truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -832,59 +832,59 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; FAST-NEXT: fmr f27, f4 ; FAST-NEXT: fmr f26, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl 
__gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f26 @@ -1012,130 +1012,130 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; BE-NEXT: fmr f23, f5 ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 652(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 668(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 660(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: fmr f24, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f23, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f22, f1 -; BE-NEXT: bl 
__gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f21, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f20, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f19, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -1354,130 +1354,130 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r3 # 16-byte Folded Spill ; CHECK-NEXT: li r3, 160 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 568(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 576(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 584(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r16, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r17, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl 
__extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -1661,115 +1661,115 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; FAST-NEXT: fmr f22, f4 ; FAST-NEXT: fmr f23, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 304(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 296(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: fmr f1, f21 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl 
__extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f19 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: fmr f1, f18 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: fmr f1, f17 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: fmr f1, f20 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: fmr f1, f22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: fmr f1, f23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f23 @@ -1946,272 +1946,272 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: std r3, 304(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: std r3, 296(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: std r3, 280(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: std r3, 264(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: std r3, 248(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: std r3, 232(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: std r3, 216(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: std r3, 200(r1) # 8-byte Folded 
Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: std r3, 184(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: std r3, 168(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: std r3, 152(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1028(r1) ; BE-NEXT: std r3, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1044(r1) ; BE-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1036(r1) ; BE-NEXT: mr r15, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1060(r1) ; BE-NEXT: mr r14, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1052(r1) ; BE-NEXT: mr r31, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1076(r1) ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1068(r1) ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1092(r1) ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1084(r1) ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1108(r1) ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1100(r1) ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1124(r1) ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1116(r1) ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1140(r1) ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1132(r1) ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1156(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1148(r1) ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1172(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1164(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: stfs f1, 316(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: stfs f1, 312(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: stfs f1, 292(r1) # 4-byte Folded Spill -; BE-NEXT: 
bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: stfs f1, 276(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: stfs f1, 260(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: stfs f1, 244(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: stfs f1, 228(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: stfs f1, 212(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: stfs f1, 196(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: stfs f1, 180(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: stfs f1, 164(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: stfs f1, 148(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: stfs f1, 132(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r31, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r14, 48 ; BE-NEXT: fmr f16, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r15, 48 ; BE-NEXT: fmr f15, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f14, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f31, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 136(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f30, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 152(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f29, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 168(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f28, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 184(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f27, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 200(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f26, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 216(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f25, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 232(r1) # 8-byte Folded 
Reload ; BE-NEXT: fmr f24, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 248(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f23, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 264(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f22, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 280(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f21, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 296(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f20, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 304(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f19, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -2572,274 +2572,274 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r4 # 16-byte Folded Spill ; CHECK-NEXT: li r4, 384 ; CHECK-NEXT: stvx v31, r1, r4 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: std r3, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: std r3, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: std r3, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: std r3, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: std r3, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: std r3, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: std r3, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: std r3, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: std r3, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: std r3, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 832(r1) ; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 840(r1) ; CHECK-NEXT: std r3, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl 
__truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 848(r1) ; CHECK-NEXT: mr r15, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 856(r1) ; CHECK-NEXT: mr r14, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 864(r1) ; CHECK-NEXT: mr r31, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 872(r1) ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 880(r1) ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 888(r1) ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 896(r1) ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 904(r1) ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 912(r1) ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 920(r1) ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 928(r1) ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 936(r1) ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 944(r1) ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 952(r1) ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 960(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 968(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 976(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 204 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r16, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 200 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r17, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl 
__extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r31, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r14, 48 ; CHECK-NEXT: fmr f16, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r15, 48 ; CHECK-NEXT: fmr f15, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 48(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f14, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 56(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f30, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 64(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v30, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 72(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v29, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 80(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v28, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 88(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v27, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 96(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v26, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 104(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v25, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v24, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v23, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 128(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v22, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: 
bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 144(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v21, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 160(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v20, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 176(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f31, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -3211,238 +3211,238 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; FAST-NEXT: xxlor v31, f6, f6 ; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill ; FAST-NEXT: lfs f1, 768(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 120 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 760(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 112 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 752(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 104 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 744(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 96 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 736(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 88 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 728(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 80 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 720(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 72 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 712(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 64 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 704(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 56 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 
696(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 48 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 688(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v21, f1, f1 ; FAST-NEXT: lfs f1, 680(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v20, f1, f1 ; FAST-NEXT: lfs f1, 672(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v24, f1, f1 ; FAST-NEXT: lfs f1, 664(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 656(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 648(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: lfs f1, 640(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: lfs f1, 632(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: lfs f1, 624(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: xxlor f1, v25, v25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: xxlor f1, v26, v26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: xxlor f1, v27, v27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: xxlor f1, v28, v28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl 
__truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: xxlor f1, v29, v29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: xxlor f1, v30, v30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: xxlor f1, v31, v31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f14 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f14, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: xxlor f1, v22, v22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: xxlor f1, v23, v23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 44 ; FAST-NEXT: fmr f15, f1 ; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f3, f15 ; FAST-NEXT: fctid f4, f17 diff --git a/llvm/test/CodeGen/SPARC/fp16-promote.ll b/llvm/test/CodeGen/SPARC/fp16-promote.ll index a15104c7b8cff..efe67b04e8fb3 100644 --- a/llvm/test/CodeGen/SPARC/fp16-promote.ll +++ b/llvm/test/CodeGen/SPARC/fp16-promote.ll @@ -20,7 +20,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; V8-LABEL: test_fpextend_float: ; V8: ! %bb.0: ; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __gnu_h2f_ieee +; V8-NEXT: call __extendhfsf2 ; V8-NEXT: lduh [%i0], %o0 ; V8-NEXT: ret ; V8-NEXT: restore @@ -28,7 +28,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; V9-LABEL: test_fpextend_float: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: ret ; V9-NEXT: restore @@ -36,7 +36,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; SPARC64-LABEL: test_fpextend_float: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore @@ -49,7 +49,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; V8-LABEL: test_fpextend_double: ; V8: ! 
%bb.0: ; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __gnu_h2f_ieee +; V8-NEXT: call __extendhfsf2 ; V8-NEXT: lduh [%i0], %o0 ; V8-NEXT: fstod %f0, %f0 ; V8-NEXT: ret @@ -58,7 +58,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; V9-LABEL: test_fpextend_double: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: fstod %f0, %f0 ; V9-NEXT: ret @@ -67,7 +67,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; SPARC64-LABEL: test_fpextend_double: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: fstod %f0, %f0 ; SPARC64-NEXT: ret @@ -81,7 +81,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V8-OPT-LABEL: test_fpextend_fp128: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -112, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-20] ; V8-OPT-NEXT: add %fp, -16, %i0 @@ -99,7 +99,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V8-UNOPT-LABEL: test_fpextend_fp128: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -112, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-20] ; V8-UNOPT-NEXT: add %fp, -16, %i0 @@ -125,7 +125,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V9-LABEL: test_fpextend_fp128: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -112, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-20] ; V9-NEXT: add %fp, -16, %i0 @@ -143,7 +143,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; SPARC64-LABEL: test_fpextend_fp128: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: add %fp, 2031, %o0 ; SPARC64-NEXT: fmovs %f0, %f3 @@ -165,7 +165,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V8-OPT-LABEL: test_fptrunc_float: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -96, %sp -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: mov %i0, %o0 ; V8-OPT-NEXT: sth %o0, [%i1] ; V8-OPT-NEXT: ret @@ -176,7 +176,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V8-UNOPT-NEXT: save %sp, -96, %sp ; V8-UNOPT-NEXT: mov %i0, %o0 ; V8-UNOPT-NEXT: st %o0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %f0 ; V8-UNOPT-NEXT: sth %o0, [%i1] ; V8-UNOPT-NEXT: ret @@ -185,7 +185,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V9-LABEL: test_fptrunc_float: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: mov %i0, %o0 ; V9-NEXT: sth %o0, [%i1] ; V9-NEXT: ret @@ -194,7 +194,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; SPARC64-LABEL: test_fptrunc_float: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: nop ; SPARC64-NEXT: sth %o0, [%i1] ; SPARC64-NEXT: ret @@ -329,15 +329,15 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fadd: ; V8-OPT: ! 
%bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-OPT-NEXT: fadds %f1, %f0, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 ; V8-OPT-NEXT: sth %o0, [%i0] ; V8-OPT-NEXT: ret @@ -346,16 +346,16 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fadd: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 ; V8-UNOPT-NEXT: fmovs %f0, %f1 ; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload ; V8-UNOPT-NEXT: fadds %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %o0 ; V8-UNOPT-NEXT: sth %o0, [%i0] ; V8-UNOPT-NEXT: ret @@ -364,15 +364,15 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fadd: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V9-NEXT: fadds %f1, %f0, %f0 ; V9-NEXT: st %f0, [%fp+-4] -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 ; V9-NEXT: sth %o0, [%i0] ; V9-NEXT: ret @@ -381,13 +381,13 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fadd: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: fadds %f1, %f0, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret @@ -403,15 +403,15 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fmul: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-OPT-NEXT: fmuls %f1, %f0, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 ; V8-OPT-NEXT: sth %o0, [%i0] ; V8-OPT-NEXT: ret @@ -420,16 +420,16 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fmul: ; V8-UNOPT: ! 
%bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 ; V8-UNOPT-NEXT: fmovs %f0, %f1 ; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload ; V8-UNOPT-NEXT: fmuls %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %o0 ; V8-UNOPT-NEXT: sth %o0, [%i0] ; V8-UNOPT-NEXT: ret @@ -438,15 +438,15 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fmul: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V9-NEXT: fmuls %f1, %f0, %f0 ; V9-NEXT: st %f0, [%fp+-4] -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 ; V9-NEXT: sth %o0, [%i0] ; V9-NEXT: ret @@ -455,13 +455,13 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fmul: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: fmuls %f1, %f0, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret diff --git a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll index 4e30778d5c158..f105966bc4d08 100644 --- a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll +++ b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll @@ -26,9 +26,9 @@ define float @func_i16fp32(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 ; CHECK-NEXT: ld %s10, 8(, %s11) @@ -58,9 +58,9 @@ define double @func_i16fp64(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -91,9 +91,9 @@ define float @func_fp16fp32(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 ; CHECK-NEXT: ld %s10, 8(, 
%s11) @@ -123,9 +123,9 @@ define double @func_fp16fp64(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -157,9 +157,9 @@ define void @func_fp32i16(ptr %fl.ptr, float %val) { ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: st %s18, 288(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: or %s18, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_f2h_ieee@lo +; CHECK-NEXT: lea %s0, __truncsfhf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_f2h_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __truncsfhf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: st2b %s0, (, %s18) @@ -194,15 +194,15 @@ define half @func_fp32fp16(ptr %fl.ptr, float %a) { ; CHECK-NEXT: st %s18, 288(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: st %s19, 296(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: or %s18, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_f2h_ieee@lo +; CHECK-NEXT: lea %s0, __truncsfhf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_f2h_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __truncsfhf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s19, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s0, __extendhfsf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s19 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: st2b %s19, (, %s18) diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll index c6c088297c0ea..db615c8065d03 100644 --- a/llvm/test/CodeGen/X86/cvt16.ll +++ b/llvm/test/CodeGen/X86/cvt16.ll @@ -41,7 +41,7 @@ define void @test1(float %src, ptr %dest) nounwind { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rbx ; SOFTFLOAT-NEXT: movq %rsi, %rbx -; SOFTFLOAT-NEXT: callq __gnu_f2h_ieee@PLT +; SOFTFLOAT-NEXT: callq __truncsfhf2@PLT ; SOFTFLOAT-NEXT: movw %ax, (%rbx) ; SOFTFLOAT-NEXT: popq %rbx ; SOFTFLOAT-NEXT: retq @@ -66,7 +66,7 @@ define float @test2(ptr nocapture %src) nounwind { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi -; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT +; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT ; SOFTFLOAT-NEXT: popq %rcx ; SOFTFLOAT-NEXT: retq %1 = load i16, ptr %src, align 2 @@ -94,9 +94,9 @@ define float @test3(float %src) nounwind uwtable readnone { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax ; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 16 -; SOFTFLOAT-NEXT: callq __gnu_f2h_ieee@PLT +; SOFTFLOAT-NEXT: callq __truncsfhf2@PLT ; SOFTFLOAT-NEXT: movzwl %ax, %edi -; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT +; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT ; SOFTFLOAT-NEXT: popq %rcx ; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 8 ; SOFTFLOAT-NEXT: retq @@ -126,7 +126,7 @@ define double @test4(ptr nocapture %src) nounwind { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi -; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT +; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT ; SOFTFLOAT-NEXT: movl %eax, %edi ; SOFTFLOAT-NEXT: callq __extendsfdf2@PLT ; SOFTFLOAT-NEXT: popq %rcx diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll 
b/llvm/test/CodeGen/X86/fmf-flags.ll index 24dabfc18b9e3..16ebf70126f8b 100644 --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -124,13 +124,13 @@ define dso_local float @div_arcp_by_const(half %x) { ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: calll __extendhfsf2 ; X86-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; X86-NEXT: fstps (%esp) -; X86-NEXT: calll __gnu_f2h_ieee +; X86-NEXT: calll __truncsfhf2 ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: calll __extendhfsf2 ; X86-NEXT: popl %eax ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fp-i129.ll b/llvm/test/CodeGen/X86/fp-i129.ll index 97116ce4e621f..c55c19abbd9b8 100644 --- a/llvm/test/CodeGen/X86/fp-i129.ll +++ b/llvm/test/CodeGen/X86/fp-i129.ll @@ -96,7 +96,7 @@ define i257 @fptosi257_double(double %a) nounwind { ; half tests define i257 @fptosi_half(half %a) nounwind { ; X86-LABEL: fptosi_half: -; X86: __gnu_h2f_ieee +; X86: __extendhfsf2 ; ; X64-LABEL: fptosi_half: ; X64: __extendhfsf2 @@ -106,7 +106,7 @@ define i257 @fptosi_half(half %a) nounwind { define half @uitofp_half(i257 %a) nounwind { ; X86-LABEL: uitofp_half: -; X86: __gnu_f2h_ieee +; X86: __truncsfhf2 ; ; X64-LABEL: uitofp_half: ; X64: __truncsfhf2 diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll index f141153d059ac..707b05f3478db 100644 --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -28,7 +28,7 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp { ; X64-AVX512-LABEL: TestFPExtF16_F128: ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: pushq %rax -; X64-AVX512-NEXT: vmovsh vf16(%rip), %xmm0 +; X64-AVX512-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X64-AVX512-NEXT: callq __extendhftf2@PLT ; X64-AVX512-NEXT: vmovaps %xmm0, vf128(%rip) ; X64-AVX512-NEXT: popq %rax @@ -40,7 +40,7 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp { ; X86-NEXT: subl $40, %esp ; X86-NEXT: movzwl vf16, %eax ; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: calll __extendhfsf2 ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: wait ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll index 04fce7badb951..85f4c945230e1 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -2060,7 +2060,7 @@ define i1 @test_signed_i1_f16(half %f) nounwind { ; X86-X87-NEXT: subl $24, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2158,7 +2158,7 @@ define i8 @test_signed_i8_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2253,7 +2253,7 @@ define i13 @test_signed_i13_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; 
X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2349,7 +2349,7 @@ define i16 @test_signed_i16_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2445,7 +2445,7 @@ define i19 @test_signed_i19_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2538,7 +2538,7 @@ define i32 @test_signed_i32_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2633,7 +2633,7 @@ define i50 @test_signed_i50_f16(half %f) nounwind { ; X86-X87-NEXT: subl $20, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2758,7 +2758,7 @@ define i64 @test_signed_i64_f16(half %f) nounwind { ; X86-X87-NEXT: subl $20, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2885,7 +2885,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; X86-X87-NEXT: subl $60, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) @@ -3064,7 +3064,7 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; X86-X87-NEXT: subl $60, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll index fefc92c313511..47dc3ca3616ea 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -1883,7 +1883,7 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; 
X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -1965,7 +1965,7 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2047,7 +2047,7 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2128,7 +2128,7 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2209,7 +2209,7 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind { ; X86-X87-NEXT: subl $28, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2295,7 +2295,7 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind { ; X86-X87-NEXT: subl $28, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2382,7 +2382,7 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind { ; X86-X87-NEXT: subl $24, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: fxch %st(1) ; X86-X87-NEXT: fucom %st(1) @@ -2526,7 +2526,7 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind { ; X86-X87-NEXT: subl $20, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: fxch %st(1) ; X86-X87-NEXT: fucom %st(1) @@ -2667,7 +2667,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind { ; X86-X87-NEXT: subl $44, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) @@ -2821,7 +2821,7 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind { ; X86-X87-NEXT: subl $60, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) diff 
--git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll index 35d16c3bac70d..959265d08299a 100644 --- a/llvm/test/CodeGen/X86/frem.ll +++ b/llvm/test/CodeGen/X86/frem.ll @@ -82,7 +82,7 @@ define void @frem_f128(fp128 %a0, fp128 %a1, ptr%p3) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: callq fmodf128 +; CHECK-NEXT: callq fmodf128@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rbx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll index 0f73129d984bd..f1874cc03000a 100644 --- a/llvm/test/CodeGen/X86/half-constrained.ll +++ b/llvm/test/CodeGen/X86/half-constrained.ll @@ -15,7 +15,7 @@ define float @half_to_float() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 ; X86-NOF16C-NEXT: retl @@ -64,7 +64,7 @@ define double @half_to_double() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 ; X86-NOF16C-NEXT: retl @@ -116,7 +116,7 @@ define x86_fp80 @half_to_fp80() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 ; X86-NOF16C-NEXT: retl @@ -166,7 +166,7 @@ define void @float_to_half(float %0) strictfp { ; X86-NOF16C-NEXT: flds {{[0-9]+}}(%esp) ; X86-NOF16C-NEXT: fstps (%esp) ; X86-NOF16C-NEXT: wait -; X86-NOF16C-NEXT: calll __gnu_f2h_ieee +; X86-NOF16C-NEXT: calll __truncsfhf2 ; X86-NOF16C-NEXT: movw %ax, a ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 @@ -324,17 +324,17 @@ define void @add() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NOF16C-NEXT: wait ; X86-NOF16C-NEXT: movzwl b, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NOF16C-NEXT: faddp %st, %st(1) ; X86-NOF16C-NEXT: fstps (%esp) ; X86-NOF16C-NEXT: wait -; X86-NOF16C-NEXT: calll __gnu_f2h_ieee +; X86-NOF16C-NEXT: calll __truncsfhf2 ; X86-NOF16C-NEXT: movw %ax, c ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll index 3c6e14598571d..859139463b7e3 100644 --- a/llvm/test/CodeGen/X86/ldexp.ll +++ b/llvm/test/CodeGen/X86/ldexp.ll @@ -608,14 +608,14 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) ; WIN32-NEXT: calll _ldexp ; WIN32-NEXT: fstps 
{{[0-9]+}}(%esp) ; WIN32-NEXT: flds {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps (%esp) -; WIN32-NEXT: calll ___gnu_f2h_ieee +; WIN32-NEXT: calll ___truncsfhf2 ; WIN32-NEXT: addl $16, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll index 96de34519556d..8436c1052552e 100644 --- a/llvm/test/CodeGen/X86/llvm.frexp.ll +++ b/llvm/test/CodeGen/X86/llvm.frexp.ll @@ -45,7 +45,7 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; WIN32-NEXT: subl $20, %esp ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) @@ -54,7 +54,7 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; WIN32-NEXT: flds {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps (%esp) ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: calll ___gnu_f2h_ieee +; WIN32-NEXT: calll ___truncsfhf2 ; WIN32-NEXT: movl %esi, %edx ; WIN32-NEXT: addl $20, %esp ; WIN32-NEXT: popl %esi @@ -95,7 +95,7 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; WIN32-NEXT: subl $20, %esp ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) @@ -103,7 +103,7 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; WIN32-NEXT: fstps {{[0-9]+}}(%esp) ; WIN32-NEXT: flds {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps (%esp) -; WIN32-NEXT: calll ___gnu_f2h_ieee +; WIN32-NEXT: calll ___truncsfhf2 ; WIN32-NEXT: addl $20, %esp ; WIN32-NEXT: retl %result = call { half, i32 } @llvm.frexp.f16.i32(half %a) @@ -146,7 +146,7 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) { ; WIN32-NEXT: subl $16, %esp ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index 0932938b209a4..22181ce33f0da 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -2060,7 +2060,7 @@ TEST_F(AArch64GISelMITest, LibcallFPExt) { auto CheckStr = R"( CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC CHECK: $h0 = COPY [[TRUNC]] - CHECK: BL &__gnu_h2f_ieee + CHECK: BL &__extendhfsf2 CHECK: $d0 = COPY CHECK: BL &__extenddftf2 )"; @@ -2103,7 +2103,7 @@ TEST_F(AArch64GISelMITest, LibcallFPTrunc) { auto CheckStr = R"( CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC CHECK: $s0 = COPY [[TRUNC]] - CHECK: BL &__gnu_f2h_ieee + CHECK: BL &__truncsfhf2 CHECK: $q0 = COPY CHECK: BL &__trunctfdf2 )"; From 8615f9aaffd4337a33ea979f010c4d6410ba6125 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 10:20:48 +0100 Subject: [PATCH 034/220] [AMDGPU] Replace gfx940 and gfx941 with gfx942 in llvm (#126763) gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all non-documentation occurrences of gfx940/gfx941 from the llvm directory, and the remaining occurrences in clang. Documentation changes will follow. 
For SWDEV-512631 --- .../Misc/target-invalid-cpu-note/amdgcn.c | 2 - llvm/docs/AMDGPUUsage.rst | 4 +- llvm/include/llvm/BinaryFormat/ELF.h | 4 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 56 +++++++++---------- llvm/include/llvm/TargetParser/TargetParser.h | 2 - llvm/lib/Object/ELFObjectFile.cpp | 4 -- llvm/lib/ObjectYAML/ELFYAML.cpp | 2 - llvm/lib/Target/AMDGPU/AMDGPU.td | 22 -------- .../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +- llvm/lib/Target/AMDGPU/DSInstructions.td | 2 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 4 +- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6 +- llvm/lib/Target/AMDGPU/GCNProcessors.td | 14 +---- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +- .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 -- llvm/lib/Target/AMDGPU/SIDefines.h | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 20 +++---- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 1 - llvm/lib/Target/AMDGPU/SISchedule.td | 6 +- .../Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 2 +- llvm/lib/TargetParser/TargetParser.cpp | 6 -- .../preload-implicit-kernargs-debug-info.ll | 2 +- llvm/tools/llvm-readobj/ELFDumper.cpp | 2 - 23 files changed, 60 insertions(+), 113 deletions(-) diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c index 642d2df211c21..9ef44b2bb403e 100644 --- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c +++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c @@ -45,8 +45,6 @@ // CHECK-SAME: {{^}}, gfx909 // CHECK-SAME: {{^}}, gfx90a // CHECK-SAME: {{^}}, gfx90c -// CHECK-SAME: {{^}}, gfx940 -// CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 // CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx1010 diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 5966d1617feee..936e8e2960bf1 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -2232,7 +2232,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX1035`` 0x03d ``gfx1035`` ``EF_AMDGPU_MACH_AMDGCN_GFX1034`` 0x03e ``gfx1034`` ``EF_AMDGPU_MACH_AMDGCN_GFX90A`` 0x03f ``gfx90a`` - ``EF_AMDGPU_MACH_AMDGCN_GFX940`` 0x040 ``gfx940`` + *reserved* 0x040 Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1100`` 0x041 ``gfx1100`` ``EF_AMDGPU_MACH_AMDGCN_GFX1013`` 0x042 ``gfx1013`` ``EF_AMDGPU_MACH_AMDGCN_GFX1150`` 0x043 ``gfx1150`` @@ -2243,7 +2243,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX1200`` 0x048 ``gfx1200`` *reserved* 0x049 Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1151`` 0x04a ``gfx1151`` - ``EF_AMDGPU_MACH_AMDGCN_GFX941`` 0x04b ``gfx941`` + *reserved* 0x04b Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942`` *reserved* 0x04d Reserved. 
``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201`` diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 64f643749d6ac..37eab89e706db 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -814,7 +814,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, - EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043, @@ -825,7 +825,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, - EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4B = 0x04b, EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 9558f2b9b74e0..1e4f25c642493 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1074,7 +1074,7 @@ class AMDGPUImageDimIntrinsic.DmaskArgIndex>>]), @@ -1321,7 +1321,7 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // Note: volatile bit is **not** permitted here. @@ -1351,7 +1351,7 @@ class AMDGPURawBufferLoad : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1381,7 +1381,7 @@ class AMDGPURawPtrBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1413,7 +1413,7 @@ class AMDGPUStructBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1431,7 +1431,7 @@ class AMDGPUStructAtomicBufferLoad : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) 
@@ -1448,7 +1448,7 @@ class AMDGPUStructPtrBufferLoad : DefaultAttrsIn llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1467,7 +1467,7 @@ class AMDGPUStructPtrAtomicBufferLoad : Intrinsi llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1485,7 +1485,7 @@ class AMDGPURawBufferStore : DefaultAttrsIntrins llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1503,7 +1503,7 @@ class AMDGPURawPtrBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1523,7 +1523,7 @@ class AMDGPUStructBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1542,7 +1542,7 @@ class AMDGPUStructPtrBufferStore : DefaultAttrsI llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1628,7 +1628,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; -// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx950, gfx12+. +// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx942, gfx950, gfx12+. 
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic; class AMDGPUStructBufferAtomic : Intrinsic < @@ -1727,7 +1727,7 @@ def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz [IntrReadMem, @@ -1743,7 +1743,7 @@ def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1761,7 +1761,7 @@ def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1779,7 +1779,7 @@ def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1797,7 +1797,7 @@ def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1815,7 +1815,7 @@ def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1834,7 +1834,7 @@ def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1853,7 +1853,7 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = 
sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1872,7 +1872,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1891,7 +1891,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1914,7 +1914,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1934,7 +1934,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -3007,7 +3007,7 @@ def int_amdgcn_fdot2_f32_bf16 : // f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + c // TODO: This actually is similar to llvm.amdgcn.fdot2 intrinsics which produces -// v_dot2c_f32_f16 on gfx940. Maybe we can consolidate these. +// v_dot2c_f32_f16 on gfx942. Maybe we can consolidate these. def int_amdgcn_fdot2c_f32_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">, @@ -3250,7 +3250,7 @@ def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic; -// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. +// Note: in gfx942 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. // Three bits corresponding to the neg modifier applied to the respective // source operand. 
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic; @@ -3258,7 +3258,7 @@ def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic : diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index 55e7b417428c4..f776b41f3d7ca 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -83,8 +83,6 @@ enum GPUKind : uint32_t { GK_GFX909 = 65, GK_GFX90A = 66, GK_GFX90C = 67, - GK_GFX940 = 68, - GK_GFX941 = 69, GK_GFX942 = 70, GK_GFX950 = 71, diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 2d3d70db50c39..ac25d76709726 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -545,10 +545,6 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx90a"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: return "gfx90c"; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: - return "gfx940"; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: - return "gfx941"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: return "gfx942"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 05e4d85b2ea5d..1f970739c1e7e 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -609,8 +609,6 @@ void ScalarBitSetTraits::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90C, EF_AMDGPU_MACH); - BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH); - BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 6439149d801f6..3aabca49b249e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1619,28 +1619,6 @@ def FeatureISAVersion9_5_Common : FeatureSet< FeatureAtomicBufferPkAddBF16Inst ])>; -def FeatureISAVersion9_4_0 : FeatureSet< - !listconcat(FeatureISAVersion9_4_Common.Features, - [ - FeatureAddressableLocalMemorySize65536, - FeatureForceStoreSC0SC1, - FeatureFP8Insts, - FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, - FeatureXF32Insts - ])>; - -def FeatureISAVersion9_4_1 : FeatureSet< - !listconcat(FeatureISAVersion9_4_Common.Features, - [ - FeatureAddressableLocalMemorySize65536, - FeatureForceStoreSC0SC1, - FeatureFP8Insts, - FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, - FeatureXF32Insts - ])>; - def FeatureISAVersion9_4_2 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3bbbbcf71d8ae..cf3843869808b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4295,7 +4295,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( // TODO: Handle G_FSUB 0 as fneg // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. - (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() + (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard() // Packed instructions do not have abs modifiers. 
Mods |= SISrcMods::OP_SEL_1; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 9ca853befba73..d3487daee364f 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1773,7 +1773,7 @@ def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; -// GFX940+. +// GFX942+. def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index ea6e703eba5d9..7988a9ac0ce55 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -814,7 +814,7 @@ defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax", } // End SubtargetPredicate = isGFX7GFX10GFX11 -// GFX940-, GFX11-only flat instructions. +// GFX942-, GFX11-only flat instructions. let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>; } // End SubtargetPredicate = HasFlatAtomicFaddF32Inst @@ -2076,7 +2076,7 @@ defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x1f>; let SubtargetPredicate = isGFX8GFX9NotGFX940 in { - // These instructions are encoded differently on gfx90* and gfx940. + // These instructions are encoded differently on gfx90* and gfx94*. defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 827598078af53..1ff75095b220a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2292,7 +2292,7 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 @@ -2600,7 +2600,7 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 @@ -2610,7 +2610,7 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index a86c76bb6075e..0b372e29efe67 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -192,15 +192,7 @@ def : ProcessorModel<"gfx90c", SIQuarterSpeedModel, FeatureISAVersion9_0_C.Features >; -def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel, - FeatureISAVersion9_4_0.Features ->; - -def : ProcessorModel<"gfx941", SIDPGFX940FullSpeedModel, - FeatureISAVersion9_4_1.Features ->; - -def : ProcessorModel<"gfx942", 
SIDPGFX940FullSpeedModel, +def : ProcessorModel<"gfx942", SIDPGFX942FullSpeedModel, FeatureISAVersion9_4_2.Features >; @@ -213,8 +205,8 @@ def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel, FeatureISAVersion9_Generic.Features >; -// [gfx940, gfx941, gfx942] -def : ProcessorModel<"gfx9-4-generic", SIDPGFX940FullSpeedModel, +// [gfx942] +def : ProcessorModel<"gfx9-4-generic", SIDPGFX942FullSpeedModel, FeatureISAVersion9_4_Generic.Features >; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 342b211199dca..f7c5c472c93a5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1297,11 +1297,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPackedTID() const { return HasPackedTID; } - // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that + // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that // hasGFX90AInsts is also true. bool hasGFX940Insts() const { return GFX940Insts; } - // GFX950 is a derivation to GFX940. hasGFX950Insts() implies that + // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that // hasGFX940Insts and hasGFX90AInsts are also true. bool hasGFX950Insts() const { return GFX950Insts; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 059bab5838526..4a4ad712e304d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -93,8 +93,6 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: AK = GK_GFX950; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; @@ -180,8 +178,6 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; - case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; - case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941; case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942; case GK_GFX950: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index f812ae652b63d..721601efcc804 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -542,7 +542,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_EXCP_FLAG_USER = 18, ID_TRAP_CTRL = 19, - // GFX940 specific registers + // GFX94* specific registers ID_XCC_ID = 20, ID_SQ_PERF_SNAPSHOT_DATA = 21, ID_SQ_PERF_SNAPSHOT_DATA1 = 22, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e09b310d107ac..909ad07782fc6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16823,39 +16823,39 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // safe. 
The message phrasing also should be better. if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { if (AS == AMDGPUAS::FLAT_ADDRESS) { - // gfx940, gfx12 + // gfx942, gfx12 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { - // gfx90a, gfx940, gfx12 + // gfx90a, gfx942, gfx12 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // gfx940, gfx12 + // gfx942, gfx12 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { - // gfx90a, gfx940, gfx12 + // gfx90a, gfx942, gfx12 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for + // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for // buffer. gfx12 does have the buffer version. if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); } - // global and flat atomic fadd f64: gfx90a, gfx940. + // global and flat atomic fadd f64: gfx90a, gfx942. if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) return ReportUnsafeHWInst(AtomicExpansionKind::None); if (AS != AMDGPUAS::FLAT_ADDRESS) { if (Ty->isFloatTy()) { - // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, + // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942, // gfx11+. if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. + // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+. if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) return ReportUnsafeHWInst(AtomicExpansionKind::None); } else { @@ -16867,7 +16867,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { } } - // flat atomic fadd f32: gfx940, gfx11+. + // flat atomic fadd f32: gfx942, gfx11+. if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { if (Subtarget->hasFlatAtomicFaddF32Inst()) return ReportUnsafeHWInst(AtomicExpansionKind::None); @@ -16906,7 +16906,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // float, double restored in gfx10. // double removed again in gfx11, so only f32 for gfx11/gfx12. // - // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but + // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but // no f32. 
if (AS == AMDGPUAS::FLAT_ADDRESS) { if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index be6cff873532b..79fb36acc0ea7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -492,7 +492,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { } public: - SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 117add324db56..2a374b360b04a 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -94,7 +94,7 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; def SIDPFullSpeedModel : SISchedMachineModel; -def SIDPGFX940FullSpeedModel : SISchedMachineModel; +def SIDPGFX942FullSpeedModel : SISchedMachineModel; def SIDPGFX950FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; def GFX11SpeedModel : SISchedMachineModel; @@ -276,7 +276,7 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; } // End SchedModel = SIDPFullSpeedModel -let SchedModel = SIDPGFX940FullSpeedModel in { +let SchedModel = SIDPGFX942FullSpeedModel in { defm : SICommonWriteRes; @@ -308,7 +308,7 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; -} // End SchedModel = SIDPGFX940FullSpeedModel +} // End SchedModel = SIDPGFX942FullSpeedModel let SchedModel = SIDPGFX950FullSpeedModel in { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index a8e4ce133ffbc..e433b85489e6e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -216,7 +216,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus}, {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, - // GFX940 specific registers + // GFX942 specific registers {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940}, diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 0a605dfd017cb..8731a16b88a5c 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -104,8 +104,6 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, {{"gfx90a"}, {"gfx90a"}, GK_GFX90A, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx90c"}, {"gfx90c"}, GK_GFX90C, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, - {{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, - {{"gfx941"}, {"gfx941"}, GK_GFX941, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx942"}, {"gfx942"}, GK_GFX942, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, 
{{"gfx950"}, {"gfx950"}, GK_GFX950, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, @@ -260,8 +258,6 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX909: return {9, 0, 9}; case GK_GFX90A: return {9, 0, 10}; case GK_GFX90C: return {9, 0, 12}; - case GK_GFX940: return {9, 4, 0}; - case GK_GFX941: return {9, 4, 1}; case GK_GFX942: return {9, 4, 2}; case GK_GFX950: return {9, 5, 0}; case GK_GFX1010: return {10, 1, 0}; @@ -506,8 +502,6 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx950-insts"] = true; [[fallthrough]]; case GK_GFX942: - case GK_GFX941: - case GK_GFX940: Features["fp8-insts"] = true; Features["fp8-conversion-insts"] = true; if (Kind != GK_GFX950) diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll index b008f397318e8..89c9801b5e466 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ ; RUN: | FileCheck --match-full-lines --implicit-check-not='declare' %s ; Confirms we do not leave behind a declaration which references the same diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 2da08127f20a8..fdae09ac767e6 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1624,8 +1624,6 @@ const EnumEntry ElfHeaderMipsFlags[] = { ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c"), \ - ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \ - ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \ From 17602893409a0d396d37162a3b42254689e02e09 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 19 Feb 2025 10:22:37 +0100 Subject: [PATCH 035/220] [clang][bytecode] Fix three-way unordered non-pointer comparisions (#127759) This _can_ happen with non-pointers, but we shouldn't diagnose it in that case. --- clang/lib/AST/ByteCode/Interp.h | 15 ++++++++------- clang/test/AST/ByteCode/cxx20.cpp | 2 ++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index ca74046038072..fa113aa0bb157 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1132,13 +1132,14 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) { const Pointer &P = S.Stk.peek(); ComparisonCategoryResult CmpResult = LHS.compare(RHS); - if (CmpResult == ComparisonCategoryResult::Unordered) { - // This should only happen with pointers. 
- const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified) - << LHS.toDiagnosticString(S.getASTContext()) - << RHS.toDiagnosticString(S.getASTContext()); - return false; + if constexpr (std::is_same_v) { + if (CmpResult == ComparisonCategoryResult::Unordered) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified) + << LHS.toDiagnosticString(S.getASTContext()) + << RHS.toDiagnosticString(S.getASTContext()); + return false; + } } assert(CmpInfo); diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 6f65fa5c7cfd3..06501de64916a 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -626,6 +626,8 @@ namespace ThreeWayCmp { constexpr int k = (1 <=> 1, 0); // both-warning {{comparison result unused}} static_assert(k== 0, ""); + static_assert(__builtin_nanf("") <=> __builtin_nanf("") == -127, ""); + /// Pointers. constexpr int a[] = {1,2,3}; constexpr int b[] = {1,2,3}; From 2260d592578082abd0e1f57bab7d4d9b18b687d1 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 10:26:09 +0100 Subject: [PATCH 036/220] [AMDGPU] Remove FeatureForceStoreSC0SC1 (#126878) This was only used for gfx940 and gfx941, which have since been removed. For SWDEV-512631 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ------ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 -------------------- 3 files changed, 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3aabca49b249e..effc8d2ed6b49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1040,12 +1040,6 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; -def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", - "HasForceStoreSC0SC1", - "true", - "Has SC0 and SC1 on stores" ->; - def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "HasSALUFloatInsts", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f7c5c472c93a5..6664a70572ded 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -246,7 +246,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; - bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; bool HasAshrPkInsts = false; @@ -1264,8 +1263,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } - bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } - bool requiresCodeObjectV6() const { return RequiresCOV6; } bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 79fb36acc0ea7..34953f9c08db7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,11 +359,6 @@ class SICacheControl { /// Virtual destructor to allow derivations to be deleted. 
virtual ~SICacheControl() = default; - - virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const { - return false; - } }; class SIGfx6CacheControl : public SICacheControl { @@ -517,20 +512,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const override { - bool Changed = false; - if (ST.hasForceStoreSC0SC1() && - (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | - SIAtomicAddrSpace::GLOBAL | - SIAtomicAddrSpace::OTHER)) != - SIAtomicAddrSpace::NONE) { - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - } - return Changed; - } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2820,7 +2801,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { Changed |= expandLoad(*MOI, MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); - Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) From db597084c5dbbf896d121b74ba80a7a76250fc78 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 10:31:47 +0100 Subject: [PATCH 037/220] [AMDGPU][docs] Replace gfx940 and gfx941 with gfx942 in llvm/docs (#126887) gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. This PR removes all documentation occurrences of gfx940/gfx941 except for the gfx940 ISA description, which will be the subject of a separate PR. For SWDEV-512631 --- llvm/docs/AMDGPUOperandSyntax.rst | 4 +- llvm/docs/AMDGPUUsage.rst | 97 ++++++++++--------------------- 2 files changed, 34 insertions(+), 67 deletions(-) diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst index ff6ec6cf71ff2..e8a76322fe76a 100644 --- a/llvm/docs/AMDGPUOperandSyntax.rst +++ b/llvm/docs/AMDGPUOperandSyntax.rst @@ -63,7 +63,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *vector* registers must be even-aligned (first register must be even). @@ -183,7 +183,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *accumulator* registers must be even-aligned (first register must be even). diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 936e8e2960bf1..d580be1eb8cfc 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. 
- **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX942-CDNA3]_ ----------------------------------------------------------------------------------------------------------------------- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa* Frontier Edition @@ -378,20 +378,6 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Ryzen 3 Pro 4350G - Ryzen 3 Pro 4350GE - ``gfx940`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: - - kernarg preload - Packed - work-item Add product - IDs names. - - ``gfx941`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: - - kernarg preload - Packed - work-item Add product - IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X - tgsplit flat - AMD Instinct MI300A - xnack scratch @@ -583,10 +569,10 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor - ``v_dot2_f32_f16`` - ``gfx9-4-generic`` ``amdgcn`` - ``gfx940`` - sramecc - Architected FP8 and BF8 instructions, - - ``gfx941`` - tgsplit flat scratch FP8 and BF8 conversion - - ``gfx942`` - xnack - Packed instructions, as well as - - ``gfx950`` - kernarg preload work-item instructions with XF32 format + ``gfx9-4-generic`` ``amdgcn`` - ``gfx942`` - sramecc - Architected FP8 and BF8 instructions, + - ``gfx950`` - tgsplit flat scratch FP8 and BF8 conversion + - xnack - Packed instructions, as well as + - kernarg preload work-item instructions with XF32 format IDs support are not available. ``gfx10-1-generic`` ``amdgcn`` - ``gfx1010`` - xnack - Absolute flat - The following instructions are @@ -4985,7 +4971,7 @@ The fields used by CP for code objects before V3 also match those specified in bytes 383:352 4 bytes COMPUTE_PGM_RSRC3 GFX6-GFX9 Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 Compute Shader (CS) program settings used by CP to set up @@ -5070,7 +5056,7 @@ The fields used by CP for code objects before V3 also match those specified in 463:460 4 bits Reserved, must be 0. 470:464 7 bits KERNARG_PRELOAD_SPEC_LENGTH GFX6-GFX9 - Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 - The number of dwords from the kernarg segment to preload into User SGPRs before kernel @@ -5078,7 +5064,7 @@ The fields used by CP for code objects before V3 also match those specified in :ref:`amdgpu-amdhsa-kernarg-preload`). 479:471 9 bits KERNARG_PRELOAD_SPEC_OFFSET GFX6-GFX9 - Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 - An offset in dwords into the kernarg segment to begin preloading data into User @@ -5104,7 +5090,7 @@ The fields used by CP for code objects before V3 also match those specified in GFX6-GFX9 - vgprs_used 0..256 - max(0, ceil(vgprs_used / 4) - 1) - GFX90A, GFX940 + GFX90A, GFX942 - vgprs_used 0..512 - vgprs_used = align(arch_vgprs, 4) + acc_vgprs @@ -5570,7 +5556,7 @@ The fields used by CP for code objects before V3 also match those specified in .. - .. table:: compute_pgm_rsrc3 for GFX90A, GFX940 + .. 
table:: compute_pgm_rsrc3 for GFX90A, GFX942 :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table ======= ======= =============================== =========================================================================== @@ -9981,15 +9967,15 @@ only accessed by a single thread, and is always write-before-read, there is never a need to invalidate these entries from the L1 cache. Hence all cache invalidates are done as ``*_vol`` to only invalidate the volatile cache lines. -The code sequences used to implement the memory model for GFX940, GFX941, GFX942 -are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx941-gfx942-table`. +The code sequences used to implement the memory model for GFX942 are defined in +table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx942-table`. - .. table:: AMDHSA Memory Model Code Sequences GFX940, GFX941, GFX942 - :name: amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx941-gfx942-table + .. table:: AMDHSA Memory Model Code Sequences GFX942 + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx942-table ============ ============ ============== ========== ================================ LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code - Ordering Sync Scope Address GFX940, GFX941, GFX942 + Ordering Sync Scope Address GFX942 Space ============ ============ ============== ========== ================================ **Non-Atomic** @@ -10024,18 +10010,12 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 load *none* *none* - local 1. ds_load store *none* *none* - global - !volatile & !nontemporal - generic - - private 1. GFX940, GFX941 + - private 1. GFX942 - constant buffer/global/flat_store - sc0=1 sc1=1 - GFX942 - buffer/global/flat_store - !volatile & nontemporal - 1. GFX940, GFX941 - buffer/global/flat_store - nt=1 sc0=1 sc1=1 - GFX942 + 1. GFX942 buffer/global/flat_store nt=1 @@ -10707,11 +10687,8 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 **Release Atomic** ------------------------------------------------------------------------------------ - store atomic release - singlethread - global 1. GFX940, GFX941 + store atomic release - singlethread - global 1. GFX942 - wavefront - generic buffer/global/flat_store - sc0=1 sc1=1 - GFX942 - buffer/global/flat_store store atomic release - singlethread - local *If TgSplit execution mode, - wavefront local address space cannot @@ -10749,10 +10726,7 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 store that is being released. - 2. GFX940, GFX941 - buffer/global/flat_store - sc0=1 sc1=1 - GFX942 + 2. GFX942 buffer/global/flat_store sc0=1 store atomic release - workgroup - local *If TgSplit execution mode, @@ -10813,10 +10787,7 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 store that is being released. - 3. GFX940, GFX941 - buffer/global/flat_store - sc0=1 sc1=1 - GFX942 + 3. GFX942 buffer/global/flat_store sc1=1 store atomic release - system - global 1. buffer_wbl2 sc0=1 sc1=1 @@ -17574,11 +17545,7 @@ in this description. 
CDNA 2 :doc:`GFX9` :doc:`gfx90a` - CDNA 3 :doc:`GFX9` :doc:`gfx940` - - :doc:`gfx941` - - :doc:`gfx942` + CDNA 3 :doc:`GFX9` :doc:`gfx942` RDNA 1 :doc:`GFX10 RDNA1` :doc:`gfx1010` @@ -17616,7 +17583,7 @@ combinations of operands, refer to one of instruction set architecture manuals [AMD-GCN-GFX6]_, [AMD-GCN-GFX7]_, [AMD-GCN-GFX8]_, [AMD-GCN-GFX900-GFX904-VEGA]_, [AMD-GCN-GFX906-VEGA7NM]_, [AMD-GCN-GFX908-CDNA1]_, [AMD-GCN-GFX90A-CDNA2]_, -[AMD-GCN-GFX940-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_, +[AMD-GCN-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_, [AMD-GCN-GFX11-RDNA3]_ and [AMD-GCN-GFX11-RDNA3.5]_. Operands @@ -18129,7 +18096,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table` ``.amdhsa_user_sgpr_private_segment_buffer`` 0 GFX6-GFX10 Controls ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER in (except :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - GFX940) + GFX942) ``.amdhsa_user_sgpr_dispatch_ptr`` 0 GFX6-GFX12 Controls ENABLE_SGPR_DISPATCH_PTR in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_queue_ptr`` 0 GFX6-GFX12 Controls ENABLE_SGPR_QUEUE_PTR in @@ -18140,7 +18107,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_flat_scratch_init`` 0 GFX6-GFX10 Controls ENABLE_SGPR_FLAT_SCRATCH_INIT in (except :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - GFX940) + GFX942) ``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in @@ -18151,8 +18118,8 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_system_sgpr_private_segment_wavefront_offset`` 0 GFX6-GFX10 Controls ENABLE_PRIVATE_SEGMENT in (except :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. - GFX940) - ``.amdhsa_enable_private_segment`` 0 GFX940, Controls ENABLE_PRIVATE_SEGMENT in + GFX942) + ``.amdhsa_enable_private_segment`` 0 GFX942, Controls ENABLE_PRIVATE_SEGMENT in GFX11-GFX12 :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_system_sgpr_workgroup_id_x`` 1 GFX6-GFX12 Controls ENABLE_SGPR_WORKGROUP_ID_X in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. @@ -18173,14 +18140,14 @@ terminated by an ``.end_amdhsa_kernel`` directive. Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_accum_offset`` Required GFX90A, Offset of a first AccVGPR in the unified register file. - GFX940 Used to calculate ACCUM_OFFSET in + GFX942 Used to calculate ACCUM_OFFSET in :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. ``.amdhsa_reserve_vcc`` 1 GFX6-GFX12 Whether the kernel may use the special VCC SGPR. Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_reserve_flat_scratch`` 1 GFX7-GFX10 Whether the kernel may use flat instructions to access (except scratch memory. Used to calculate - GFX940) GRANULATED_WAVEFRONT_SGPR_COUNT in + GFX942) GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_reserve_xnack_mask`` Target GFX8-GFX10 Whether the kernel may trigger XNACK replay. Feature Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in @@ -18211,7 +18178,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. 
``.amdhsa_fp16_overflow`` 0 GFX9-GFX12 Controls FP16_OVFL in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_tg_split`` Target GFX90A, Controls TG_SPLIT in - Feature GFX940, :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. + Feature GFX942, :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. Specific GFX11-GFX12 (tgsplit) ``.amdhsa_workgroup_processor_mode`` Target GFX10-GFX12 Controls ENABLE_WGP_MODE in @@ -18242,9 +18209,9 @@ terminated by an ``.end_amdhsa_kernel`` directive. ``.amdhsa_exception_int_div_zero`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_user_sgpr_kernarg_preload_length`` 0 GFX90A, Controls KERNARG_PRELOAD_SPEC_LENGTH in - GFX940 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. + GFX942 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_kernarg_preload_offset`` 0 GFX90A, Controls KERNARG_PRELOAD_SPEC_OFFSET in - GFX940 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. + GFX942 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ======================================================== =================== ============ =================== .amdgpu_metadata @@ -18414,7 +18381,7 @@ Additional Documentation .. [AMD-GCN-GFX906-VEGA7NM] `AMD Vega 7nm Instruction Set Architecture `__ .. [AMD-GCN-GFX908-CDNA1] `AMD Instinct MI100 Instruction Set Architecture `__ .. [AMD-GCN-GFX90A-CDNA2] `AMD Instinct MI200 Instruction Set Architecture `__ -.. [AMD-GCN-GFX940-GFX942-CDNA3] `AMD Instinct MI300 Instruction Set Architecture `__ +.. [AMD-GCN-GFX942-CDNA3] `AMD Instinct MI300 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA1] `AMD RDNA 1.0 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA2] `AMD RDNA 2 Instruction Set Architecture `__ .. [AMD-GCN-GFX11-RDNA3] `AMD RDNA 3 Instruction Set Architecture `__ From 0607f94280820d38d4f120952a6a09891a046853 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 19 Feb 2025 09:34:15 +0000 Subject: [PATCH 038/220] [X86] getFauxShuffleMask - add support for vXi64/vXf64 concat_vectors decoding (#127630) Similar to insert_subvector - limit this to vXi64 vector cases to make the most of cross lane shuffles (for now). --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 ++ .../vector-interleaved-store-i8-stride-7.ll | 16 +-- .../vector-interleaved-store-i8-stride-8.ll | 134 +++++++++--------- .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 61 ++++++-- .../CodeGen/X86/zero_extend_vector_inreg.ll | 28 ++-- 5 files changed, 150 insertions(+), 102 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index dc25af9e2f1d5..7485fc48f4132 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6110,6 +6110,19 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Ops.push_back(N1); return true; } + case ISD::CONCAT_VECTORS: { + // Limit this to vXi64 vector cases to make the most of cross lane shuffles. 
+ unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements(); + if (NumBitsPerElt == 64) { + for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) { + for (unsigned M = 0; M != NumSubElts; ++M) + Mask.push_back((I * NumElts) + M); + Ops.push_back(N.getOperand(I)); + } + return true; + } + return false; + } case ISD::INSERT_SUBVECTOR: { SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index be83db26aa7ed..89ed0040a71c2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1215,10 +1215,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} @@ -1294,10 +1294,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 251139161e46f..75c470a6d40c6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -1161,23 +1161,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] +; 
AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $287445282, %ecx # imm = 0x11221122 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1231,23 +1231,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $287445282, %ecx # imm = 0x11221122 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; 
AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512DQ-BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2126,41 +2126,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = 
zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] +; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2231,41 +2230,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] -; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm5, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index da65fecba773b..d6208aca3b2b7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1654,12 +1654,27 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> @@ -1667,12 +1682,34 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { } define <4 
x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_1032_v2i64: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index f7c29cba30bd5..6b1d118ca97ad 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -6173,13 +6173,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3] +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; 
AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6209,13 +6209,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3] +; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper From c442b39770baa397a98ecfb79719cc09081e4a81 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 19 Feb 2025 10:37:48 +0100 Subject: [PATCH 039/220] [AMDGPU][docs][NFC] Replace gfx940 with gfx942 in the gfx940 ISA doc (#126906) gfx940 and gfx941 are no longer supported. This is the last one of a series of PRs to remove them from the code base. The ISA documentation still contains a lot of links and file names with the "gfx940" identifier. Changing them to "gfx942" is probably not worth the cost of breaking all URLs to these pages that users might have saved in the past. For SWDEV-512631 --- llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst index 90797499dec22..7603bcc95383b 100644 --- a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst +++ b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst @@ -6,7 +6,7 @@ ************************************************** ==================================================================================== -Syntax of gfx940 Instructions +Syntax of gfx942 Instructions ==================================================================================== .. contents:: @@ -15,7 +15,7 @@ Syntax of gfx940 Instructions Introduction ============ -This document describes the syntax of gfx940 instructions. +This document describes the syntax of gfx942 instructions. 
Notation ======== From 160da73b8583b2f94cba81b220710684a9590da3 Mon Sep 17 00:00:00 2001 From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:21:31 +0530 Subject: [PATCH 040/220] [Flang] Check if two ArrayConstructor's are Equal (#121181) This also includes comparing the two ImpliedDo Details - For ArrayConstructor, check if x and y have the same elements and type - For ImpliedDo, check if x and y have the same lower, upper, stride and values Fixes: https://github.com/llvm/llvm-project/issues/104526 --- flang/lib/Lower/Support/Utils.cpp | 40 ++++++++++++++++++++++- flang/test/Lower/OpenMP/atomic-update.f90 | 15 +++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index 5a9a839330364..ed2700c42fc55 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -478,9 +478,47 @@ class IsEqualEvaluateExpr { return isEqual(x.proc(), y.proc()) && isEqual(x.arguments(), y.arguments()); } template + static bool isEqual(const Fortran::evaluate::ImpliedDo &x, + const Fortran::evaluate::ImpliedDo &y) { + return isEqual(x.values(), y.values()) && isEqual(x.lower(), y.lower()) && + isEqual(x.upper(), y.upper()) && isEqual(x.stride(), y.stride()); + } + template + static bool isEqual(const Fortran::evaluate::ArrayConstructorValues &x, + const Fortran::evaluate::ArrayConstructorValues &y) { + using Expr = Fortran::evaluate::Expr; + using ImpliedDo = Fortran::evaluate::ImpliedDo; + for (const auto &[xValue, yValue] : llvm::zip(x, y)) { + bool checkElement = Fortran::common::visit( + common::visitors{ + [&](const Expr &v, const Expr &w) { return isEqual(v, w); }, + [&](const ImpliedDo &v, const ImpliedDo &w) { + return isEqual(v, w); + }, + [&](const Expr &, const ImpliedDo &) { return false; }, + [&](const ImpliedDo &, const Expr &) { return false; }, + }, + xValue.u, yValue.u); + if (!checkElement) { + return false; + } + } + return true; + } + static bool isEqual(const Fortran::evaluate::SubscriptInteger &x, + const Fortran::evaluate::SubscriptInteger &y) { + return x == y; + } + template static bool isEqual(const Fortran::evaluate::ArrayConstructor &x, const Fortran::evaluate::ArrayConstructor &y) { - llvm::report_fatal_error("not implemented"); + bool checkCharacterType = true; + if constexpr (A::category == Fortran::common::TypeCategory::Character) { + checkCharacterType = isEqual(*x.LEN(), *y.LEN()); + } + using Base = Fortran::evaluate::ArrayConstructorValues; + return isEqual((Base)x, (Base)y) && + (x.GetType() == y.GetType() && checkCharacterType); } static bool isEqual(const Fortran::evaluate::ImpliedDoIndex &x, const Fortran::evaluate::ImpliedDoIndex &y) { diff --git a/flang/test/Lower/OpenMP/atomic-update.f90 b/flang/test/Lower/OpenMP/atomic-update.f90 index 16dae9d5f301c..7d04745015faa 100644 --- a/flang/test/Lower/OpenMP/atomic-update.f90 +++ b/flang/test/Lower/OpenMP/atomic-update.f90 @@ -185,4 +185,19 @@ program OmpAtomicUpdate !$omp atomic update w = max(w,x,y,z) +!CHECK: %[[IMP_DO:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr { +!CHECK: ^bb0(%{{.*}}: index): +! [...] +!CHECK: %[[ADD_I1:.*]] = arith.addi {{.*}} : i32 +!CHECK: hlfir.yield_element %[[ADD_I1]] : i32 +!CHECK: } +! [...] 
+!CHECK: %[[SUM:.*]] = hlfir.sum %[[IMP_DO]] +!CHECK: omp.atomic.update %[[VAL_X_DECLARE]]#1 : !fir.ref { +!CHECK: ^bb0(%[[ARG0:.*]]: i32): +!CHECK: %[[ADD_I2:.*]] = arith.addi %[[ARG0]], %[[SUM]] : i32 +!CHECK: omp.yield(%[[ADD_I2]] : i32) +!CHECK: } + !$omp atomic update + x = x + sum([ (y+2, y=1, z) ]) end program OmpAtomicUpdate From 7781e1040dc5b4a920628ee086e239cf46f74e49 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 19 Feb 2025 10:13:41 +0000 Subject: [PATCH 041/220] [clang] Lower non-builtin sincos[f|l] calls to llvm.sincos.* when -fno-math-errno is set (#121763) This will allow vectorizing these calls (after a few more patches). This should not change the codegen for targets that enable the use of AA during the codegen (in `TargetSubtargetInfo::useAA()`). This includes targets such as AArch64. This notably does not include x86 but can be worked around by passing `-mllvm -combiner-global-alias-analysis=true` to clang. Follow up to #114086. --- clang/lib/CodeGen/CGBuiltin.cpp | 5 ++++ clang/test/CodeGen/AArch64/sincos.c | 46 +++++++++++++++++++++++++++++ clang/test/CodeGen/math-libcalls.c | 11 +++++++ 3 files changed, 62 insertions(+) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 348cb523b1718..0bf8c845b307f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3377,11 +3377,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::sinh, Intrinsic::experimental_constrained_sinh)); + case Builtin::BIsincos: + case Builtin::BIsincosf: + case Builtin::BIsincosl: case Builtin::BI__builtin_sincos: case Builtin::BI__builtin_sincosf: case Builtin::BI__builtin_sincosf16: case Builtin::BI__builtin_sincosl: case Builtin::BI__builtin_sincosf128: + if (Builder.getIsFPConstrained()) + break; // TODO: Emit constrained sincos intrinsic once one exists. 
emitSincosBuiltin(*this, E, Intrinsic::sincos); return RValue::get(nullptr); diff --git a/clang/test/CodeGen/AArch64/sincos.c b/clang/test/CodeGen/AArch64/sincos.c index b77d98ceab486..736c0892ed741 100644 --- a/clang/test/CodeGen/AArch64/sincos.c +++ b/clang/test/CodeGen/AArch64/sincos.c @@ -1,6 +1,10 @@ // RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -O1 %s -o - | FileCheck --check-prefix=NO-MATH-ERRNO %s // RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -fmath-errno %s -o - | FileCheck --check-prefix=MATH-ERRNO %s +void sincos(double, double*, double*); +void sincosf(float, float*, float*); +void sincosl(long double, long double*, long double*); + // NO-MATH-ERRNO-LABEL: @sincos_f32 // NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { float, float } @llvm.sincos.f32(float {{.*}}) // NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { float, float } [[SINCOS]], 0 @@ -12,6 +16,20 @@ // MATH-ERRNO: call void @sincosf( // void sincos_f32(float x, float* fp0, float* fp1) { + sincosf(x, fp0, fp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f32 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { float, float } @llvm.sincos.f32(float {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { float, float } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { float, float } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store float [[SIN]], ptr {{.*}}, align 4, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store float [[COS]], ptr {{.*}}, align 4, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f32 +// MATH-ERRNO: call void @sincosf( +// +void sincos_builtin_f32(float x, float* fp0, float* fp1) { __builtin_sincosf(x, fp0, fp1); } @@ -26,6 +44,20 @@ void sincos_f32(float x, float* fp0, float* fp1) { // MATH-ERRNO: call void @sincos( // void sincos_f64(double x, double* dp0, double* dp1) { + sincos(x, dp0, dp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f64 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { double, double } @llvm.sincos.f64(double {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { double, double } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { double, double } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store double [[SIN]], ptr {{.*}}, align 8, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store double [[COS]], ptr {{.*}}, align 8, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f64 +// MATH-ERRNO: call void @sincos( +// +void sincos_builtin_f64(double x, double* dp0, double* dp1) { __builtin_sincos(x, dp0, dp1); } @@ -40,5 +72,19 @@ void sincos_f64(double x, double* dp0, double* dp1) { // MATH-ERRNO: call void @sincosl( // void sincos_f128(long double x, long double* ldp0, long double* ldp1) { + sincosl(x, ldp0, ldp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f128 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { fp128, fp128 } @llvm.sincos.f128(fp128 {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store fp128 [[SIN]], ptr {{.*}}, align 16, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store fp128 [[COS]], ptr {{.*}}, align 16, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f128 +// MATH-ERRNO: call void @sincosl( +// +void sincos_builtin_f128(long double x, long double* ldp0, long double* ldp1) { __builtin_sincosl(x, ldp0, ldp1); } diff --git 
a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index bcc61c8f046b4..14fdee77f4d78 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -660,6 +660,17 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // HAS_MAYTRAP: declare float @llvm.experimental.constrained.sinh.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.sinh.f80( +sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); + +// NO__ERRNO: declare { double, double } @llvm.sincos.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { float, float } @llvm.sincos.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.sincos.f80(x86_fp80) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare void @sincos(double noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_ERRNO: declare void @sincosf(float noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_ERRNO: declare void @sincosl(x86_fp80 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincos(double noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincosf(float noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincosl(x86_fp80 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] sqrt(f); sqrtf(f); sqrtl(f); From 1509b46ea5617402634270a01e840356ef935436 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 19 Feb 2025 10:24:12 +0000 Subject: [PATCH 042/220] [libclc] Improve nextafter behaviour around zero (#127469) This commit improves the behaviour of (__clc_)nextafter around zero. Specifically, the nextafter value of very small negative numbers in the positive direction is now negative zero. Previously we'd return positive zero. This behaviour is not required as far as OpenCL is concerned: at least, the CTS isn't testing for it. However, this change does bring our implementation into bit-equivalence with (libstdc++'s implementation of) std::nextafter, tested on all possible values of 32-bit float towards both positive and negative INFINITY. Furthermore, since the implementation of libclc's floating-point 'rtp' and 'rtz' conversions use __clc_nextafter, the previous behaviour was resulting in CTS validation issues. For example, when converting float -0x1.000002p-25 to half, rounding towards zero or positive infinity, nextafter was returning +0.0, whereas the correct conversion requires us to return -0.0. We could work around this issue in the conversion functions, but since the change to nextafter is small enough and the behaviour around zero matches libstdc++, the fix feels at home there. This commit also converts several variables to unsigned types to avoid undefined behaviour surrounding signed underflow on the subtractions. It also converts some variables to be kept in floating-point types, using fabs to get the absolute value rather than by bit-hacking. 
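For illustration only (this is a standalone host-side sketch, not libclc code, and it assumes an IEEE-754-conformant `nextafter` such as glibc's), the behaviour being matched is what `std::nextafter` reports when stepping the negative float of smallest magnitude towards positive infinity:

```cpp
// Standalone sketch: demonstrates the around-zero behaviour that
// __clc_nextafter now matches. Stepping the negative subnormal closest to
// zero towards +inf lands on negative zero, not positive zero.
#include <cassert>
#include <cmath>
#include <limits>

int main() {
  float tiny_neg = -std::numeric_limits<float>::denorm_min(); // -0x1p-149f
  float next = std::nextafter(tiny_neg, std::numeric_limits<float>::infinity());
  assert(next == 0.0f);       // compares equal to zero...
  assert(std::signbit(next)); // ...but the sign bit is still set: -0.0f
  return 0;
}
```

Keeping the sign here is exactly what the rtp/rtz conversions mentioned above depend on: when a tiny negative value is stepped towards zero or +inf, the result must stay on the negative side of zero.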
--- libclc/clc/lib/generic/math/clc_nextafter.cl | 36 +++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_nextafter.cl b/libclc/clc/lib/generic/math/clc_nextafter.cl index 58125485bf684..f77f3647d5985 100644 --- a/libclc/clc/lib/generic/math/clc_nextafter.cl +++ b/libclc/clc/lib/generic/math/clc_nextafter.cl @@ -1,5 +1,6 @@ #include #include +#include #include // This file provides OpenCL C implementations of __clc_nextafter for @@ -12,21 +13,30 @@ FLOAT_TYPE y) { \ const UINT_TYPE sign_bit = (UINT_TYPE)1 \ << (sizeof(INT_TYPE_SCALAR) * 8 - 1); \ - const UINT_TYPE sign_bit_mask = sign_bit - (UINT_TYPE)1; \ - INT_TYPE ix = CLC_AS_TYPE(INT_TYPE)(x); \ - UINT_TYPE ax = CLC_AS_TYPE(UINT_TYPE)(ix) & sign_bit_mask; \ - INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(sign_bit) - ix; \ - mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? mx : ix; \ - INT_TYPE iy = CLC_AS_TYPE(INT_TYPE)(y); \ - UINT_TYPE ay = CLC_AS_TYPE(UINT_TYPE)(iy) & sign_bit_mask; \ - INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(sign_bit) - iy; \ - my = iy < (INT_TYPE)0 ? my : iy; \ + UINT_TYPE ix = CLC_AS_TYPE(UINT_TYPE)(x); \ + FLOAT_TYPE absx = __clc_fabs(x); \ + UINT_TYPE mxu = sign_bit - ix; \ + INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(mxu); \ + mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? mx \ + : CLC_AS_TYPE(INT_TYPE)(ix); \ + UINT_TYPE iy = CLC_AS_TYPE(UINT_TYPE)(y); \ + FLOAT_TYPE absy = __clc_fabs(y); \ + UINT_TYPE myu = sign_bit - iy; \ + INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(myu); \ + my = CLC_AS_TYPE(INT_TYPE)(iy) < (INT_TYPE)0 ? my \ + : CLC_AS_TYPE(INT_TYPE)(iy); \ INT_TYPE t = mx + (mx < my ? (INT_TYPE)1 : (INT_TYPE)-1); \ - INT_TYPE r = CLC_AS_TYPE(INT_TYPE)(sign_bit) - t; \ - r = t < (INT_TYPE)0 ? r : t; \ + UINT_TYPE r = sign_bit - CLC_AS_TYPE(UINT_TYPE)(t); \ + r = (t < (INT_TYPE)0 || (t == (INT_TYPE)0 && mx < my)) \ + ? r \ + : CLC_AS_TYPE(UINT_TYPE)(t); \ r = __clc_isnan(x) ? ix : r; \ - r = __clc_isnan(y) ? CLC_AS_TYPE(INT_TYPE)(iy) : r; \ - r = ((ax | ay) == (UINT_TYPE)0 || ix == iy) ? iy : r; \ + r = __clc_isnan(y) ? iy : r; \ + r = ((CLC_AS_TYPE(UINT_TYPE)(absx) | CLC_AS_TYPE(UINT_TYPE)(absy)) == \ + (UINT_TYPE)0 || \ + ix == iy) \ + ? iy \ + : r; \ return CLC_AS_TYPE(FLOAT_TYPE)(r); \ } From 3c938d0d534221c44bd2477e7f95c7b5bbb6fc7a Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Wed, 19 Feb 2025 15:56:23 +0530 Subject: [PATCH 043/220] [MLIR][Affine] Fix affine.parallel op verifier (#127611) Fix affine.parallel op verifier for missing check on zero result lower or upper bound maps. lb/ub maps should have at least one result. 
Fixes: https://github.com/llvm/llvm-project/issues/120186 --- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 18 ++++++++++++++---- mlir/test/Dialect/Affine/invalid.mlir | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 147f5dd7a24b6..cfc51ad2a1524 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -3918,14 +3918,24 @@ LogicalResult AffineParallelOp::verify() { } unsigned expectedNumLBResults = 0; - for (APInt v : getLowerBoundsGroups()) - expectedNumLBResults += v.getZExtValue(); + for (APInt v : getLowerBoundsGroups()) { + unsigned results = v.getZExtValue(); + if (results == 0) + return emitOpError() + << "expected lower bound map to have at least one result"; + expectedNumLBResults += results; + } if (expectedNumLBResults != getLowerBoundsMap().getNumResults()) return emitOpError() << "expected lower bounds map to have " << expectedNumLBResults << " results"; unsigned expectedNumUBResults = 0; - for (APInt v : getUpperBoundsGroups()) - expectedNumUBResults += v.getZExtValue(); + for (APInt v : getUpperBoundsGroups()) { + unsigned results = v.getZExtValue(); + if (results == 0) + return emitOpError() + << "expected upper bound map to have at least one result"; + expectedNumUBResults += results; + } if (expectedNumUBResults != getUpperBoundsMap().getNumResults()) return emitOpError() << "expected upper bounds map to have " << expectedNumUBResults << " results"; diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir index 44e484b9ba598..da2913e3fec28 100644 --- a/mlir/test/Dialect/Affine/invalid.mlir +++ b/mlir/test/Dialect/Affine/invalid.mlir @@ -297,6 +297,24 @@ func.func @affine_parallel(%arg0 : index, %arg1 : index, %arg2 : index) { // ----- +func.func @no_upper_bound_affine_parallel() { + // expected-error@+1 {{expected lower bound map to have at least one result}} + affine.parallel (%arg2) = (max()) to (1) { + } + return +} + +// ----- + +func.func @no_upper_bound_affine_parallel() { + // expected-error@+1 {{expected upper bound map to have at least one result}} + affine.parallel (%arg3) = (0) to (min()) { + } + return +} + +// ----- + func.func @vector_load_invalid_vector_type() { %0 = memref.alloc() : memref<100xf32> affine.for %i0 = 0 to 16 step 8 { From 0de2ccab7be85aeeefcd5757d29126246c373731 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Wed, 19 Feb 2025 10:39:43 +0000 Subject: [PATCH 044/220] [flang][OpenMP]Improve support for DECLARE REDUCTION (#127088) Part of the DECLARE REDUCTION was already supported by the parser, but the semantics to add the reduction identifier wasn't implemented. The semantics would not accept the name given by the reduction, so a few lines added to support that. Some tests were in place but not quite working, so fixed those up too. Adding new tests for unparsing and parse-tree, as well as checking the symbolic name being generated. Lowering of DECLARE REDUCTION is not supported in this patch, and a test that it hits the relevant TODO is in this patch (most of this was already existing, but not actually testing the TODO message). 
--- flang/include/flang/Parser/parse-tree.h | 4 ++-- flang/lib/Parser/openmp-parsers.cpp | 8 +++---- flang/lib/Parser/unparse.cpp | 5 ++--- flang/lib/Semantics/check-omp-structure.cpp | 4 ++++ flang/lib/Semantics/resolve-directives.cpp | 9 ++++++++ flang/lib/Semantics/resolve-names.cpp | 19 +++++++++++++++- .../OpenMP/Todo/omp-declare-reduction.f90 | 4 ++-- .../OpenMP/declare-reduction-unparse.f90 | 21 ++++++++++++++++++ .../OpenMP/declarative-directive01.f90 | 22 ++++++++----------- .../Semantics/OpenMP/declare-reduction.f90 | 11 ++++++++++ 10 files changed, 81 insertions(+), 26 deletions(-) create mode 100644 flang/test/Parser/OpenMP/declare-reduction-unparse.f90 create mode 100644 flang/test/Semantics/OpenMP/declare-reduction.f90 diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index c3a02fca5ade8..6ba43f6688c25 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4553,8 +4553,8 @@ WRAPPER_CLASS(OmpReductionInitializerClause, Expr); struct OpenMPDeclareReductionConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPDeclareReductionConstruct); CharBlock source; - std::tuple, - OmpReductionCombiner, std::optional> + std::tuple, + std::optional> t; }; diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 2b6c77c08cc58..b39b8737b70c0 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -170,8 +170,8 @@ TYPE_PARSER(sourced( // TYPE_PARSER(construct(nonemptyList(Parser{}))) TYPE_PARSER( // - construct(Parser{}) || - construct(Parser{})) + construct(Parser{}) || + construct(Parser{})) TYPE_PARSER(construct( // Parser{}, @@ -1148,9 +1148,7 @@ TYPE_PARSER(construct( // 2.16 Declare Reduction Construct TYPE_PARSER(sourced(construct( verbatim("DECLARE REDUCTION"_tok), - "(" >> Parser{} / ":", - nonemptyList(Parser{}) / ":", - Parser{} / ")", + "(" >> indirect(Parser{}) / ")", maybe(Parser{})))) // declare-target with list diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index cd91fbe4ea5eb..3d00979d7b7a6 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2690,11 +2690,10 @@ class UnparseVisitor { BeginOpenMP(); Word("!$OMP DECLARE REDUCTION "); Put("("); - Walk(std::get(x.t)), Put(" : "); - Walk(std::get>(x.t), ","), Put(" : "); - Walk(std::get(x.t)); + Walk(std::get>(x.t)); Put(")"); Walk(std::get>(x.t)); + Put("\n"); EndOpenMP(); } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 1d6fe6c8d4249..49e507feab580 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -3178,6 +3178,10 @@ bool OmpStructureChecker::CheckReductionOperator( const SourceName &realName{name->symbol->GetUltimate().name()}; valid = llvm::is_contained({"max", "min", "iand", "ior", "ieor"}, realName); + if (!valid) { + auto *misc{name->symbol->detailsIf()}; + valid = misc && misc->kind() == MiscDetails::Kind::ConstructName; + } } if (!valid) { context_.Say(source, diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 7a1dfe003e8c2..38888a4dc1461 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -446,6 +446,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPDeclareMapperConstruct &); void Post(const parser::OpenMPDeclareMapperConstruct &) { PopContext(); } 
+ bool Pre(const parser::OpenMPDeclareReductionConstruct &); + void Post(const parser::OpenMPDeclareReductionConstruct &) { PopContext(); } + bool Pre(const parser::OpenMPThreadprivate &); void Post(const parser::OpenMPThreadprivate &) { PopContext(); } @@ -1976,6 +1979,12 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclareMapperConstruct &x) { return true; } +bool OmpAttributeVisitor::Pre( + const parser::OpenMPDeclareReductionConstruct &x) { + PushContext(x.source, llvm::omp::Directive::OMPD_declare_reduction); + return true; +} + bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { PushContext(x.source, llvm::omp::Directive::OMPD_threadprivate); const auto &list{std::get(x.t)}; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e64abe6b50e78..ff793658f1e06 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1482,6 +1482,15 @@ class OmpVisitor : public virtual DeclarationVisitor { return false; } + bool Pre(const parser::OpenMPDeclareReductionConstruct &x) { + AddOmpSourceRange(x.source); + parser::OmpClauseList emptyList{std::list{}}; + ProcessReductionSpecifier( + std::get>(x.t).value(), + emptyList); + Walk(std::get>(x.t)); + return false; + } bool Pre(const parser::OmpMapClause &); void Post(const parser::OmpBeginLoopDirective &) { @@ -1732,11 +1741,19 @@ void OmpVisitor::ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec, void OmpVisitor::ProcessReductionSpecifier( const parser::OmpReductionSpecifier &spec, const parser::OmpClauseList &clauses) { + BeginDeclTypeSpec(); + const auto &id{std::get(spec.t)}; + if (auto procDes{std::get_if(&id.u)}) { + if (auto *name{std::get_if(&procDes->u)}) { + name->symbol = + &MakeSymbol(*name, MiscDetails{MiscDetails::Kind::ConstructName}); + } + } + EndDeclTypeSpec(); // Creating a new scope in case the combiner expression (or clauses) use // reerved identifiers, like "omp_in". This is a temporary solution until // we deal with these in a more thorough way. PushScope(Scope::Kind::OtherConstruct, nullptr); - Walk(std::get(spec.t)); Walk(std::get(spec.t)); Walk(std::get>(spec.t)); Walk(clauses); diff --git a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 index 7a7d28db8d6f5..db50c9ac8ee9d 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 @@ -1,10 +1,10 @@ ! This test checks lowering of OpenMP declare reduction Directive. -// RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s +! RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s subroutine declare_red() integer :: my_var - // CHECK: not yet implemented: OpenMPDeclareReductionConstruct + !CHECK: not yet implemented: OpenMPDeclareReductionConstruct !$omp declare reduction (my_red : integer : omp_out = omp_in) initializer (omp_priv = 0) my_var = 0 end subroutine declare_red diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 new file mode 100644 index 0000000000000..a2a3ef9f630ab --- /dev/null +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 @@ -0,0 +1,21 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s | FileCheck --ignore-case %s +! 
RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s +!CHECK-LABEL: program main +program main + integer :: my_var + !CHECK: !$OMP DECLARE REDUCTION (my_add_red:INTEGER: omp_out=omp_out+omp_in + !CHECK-NEXT: ) INITIALIZER(OMP_PRIV = 0_4) + + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + my_var = 0 + !$omp parallel reduction (my_add_red : my_var) num_threads(4) + my_var = omp_get_thread_num() + 1 + !$omp end parallel + print *, "sum of thread numbers is ", my_var +end program main + +!PARSE-TREE: OpenMPDeclareReductionConstruct +!PARSE-TREE: OmpReductionIdentifier -> ProcedureDesignator -> Name = 'my_add_red' +!PARSE-TREE: DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec +!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out=omp_out+omp_in' +!PARSE-TREE: OmpReductionInitializerClause -> Expr = '0_4' diff --git a/flang/test/Semantics/OpenMP/declarative-directive01.f90 b/flang/test/Semantics/OpenMP/declarative-directive01.f90 index 17dc50b70e542..e8bf605565fad 100644 --- a/flang/test/Semantics/OpenMP/declarative-directive01.f90 +++ b/flang/test/Semantics/OpenMP/declarative-directive01.f90 @@ -2,9 +2,6 @@ ! Check OpenMP declarative directives -!TODO: all internal errors -! enable declare-reduction example after name resolution - ! 2.4 requires subroutine requires_1(a) @@ -88,15 +85,14 @@ end module m2 ! 2.16 declare-reduction -! subroutine declare_red_1() -! use omp_lib -! integer :: my_var -! !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) -! my_var = 0 -! !$omp parallel reduction (my_add_red : my_var) num_threads(4) -! my_var = omp_get_thread_num() + 1 -! !$omp end parallel -! print *, "sum of thread numbers is ", my_var -! end subroutine declare_red_1 +subroutine declare_red_1() + integer :: my_var + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + my_var = 0 + !$omp parallel reduction (my_add_red : my_var) num_threads(4) + my_var = 1 + !$omp end parallel + print *, "sum of thread numbers is ", my_var +end subroutine declare_red_1 end diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 new file mode 100644 index 0000000000000..8fee79dfc0b7b --- /dev/null +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -0,0 +1,11 @@ +! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s + +program main +!CHECK-LABEL: MainProgram scope: main + + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + +!CHECK: my_add_red: Misc ConstructName + +end program main + From 404f94ac7d8c368fba8ad1f97512c26efd5ec965 Mon Sep 17 00:00:00 2001 From: Elvina Yakubova Date: Wed, 19 Feb 2025 11:27:38 +0000 Subject: [PATCH 045/220] [AArch64] Add optional extensions enabled on Grace (#127620) Enable optional ISA extensions on Grace when mcpu=grace is used: sve2-sm4, sve2-aes, sve2-sha3. Grace is no longer an alias, but a separate CPU definition. 
--- clang/test/Driver/aarch64-mcpu.c | 2 +- .../print-enabled-extensions/aarch64-grace.c | 62 +++++++++++++++++++ llvm/lib/Target/AArch64/AArch64Processors.td | 5 +- llvm/test/CodeGen/AArch64/cpus.ll | 1 + .../TargetParser/TargetParserTest.cpp | 2 +- 5 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 clang/test/Driver/print-enabled-extensions/aarch64-grace.c diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c index 97303510d6881..447ee4bd3a6f9 100644 --- a/clang/test/Driver/aarch64-mcpu.c +++ b/clang/test/Driver/aarch64-mcpu.c @@ -92,7 +92,7 @@ // COBALT-100: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n2" // RUN: %clang --target=aarch64 -mcpu=grace -### -c %s 2>&1 | FileCheck -check-prefix=GRACE %s -// GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v2" +// GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "grace" // ================== Check whether -mcpu and -mtune accept mixed-case values. // RUN: %clang --target=aarch64 -mcpu=Cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CASE-INSENSITIVE-CA53 %s diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-grace.c b/clang/test/Driver/print-enabled-extensions/aarch64-grace.c new file mode 100644 index 0000000000000..fde6aee468cdc --- /dev/null +++ b/clang/test/Driver/print-enabled-extensions/aarch64-grace.c @@ -0,0 +1,62 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang --target=aarch64 --print-enabled-extensions -mcpu=grace | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s + +// CHECK: Extensions enabled for the given AArch64 target +// CHECK-EMPTY: +// CHECK-NEXT: Architecture Feature(s) Description +// CHECK-NEXT: FEAT_AES, FEAT_PMULL Enable AES support +// CHECK-NEXT: FEAT_AMUv1 Enable Armv8.4-A Activity Monitors extension +// CHECK-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// CHECK-NEXT: FEAT_BF16 Enable BFloat16 Extension +// CHECK-NEXT: FEAT_BTI Enable Branch Target Identification +// CHECK-NEXT: FEAT_CCIDX Enable Armv8.3-A Extend of the CCSIDR number of sets +// CHECK-NEXT: FEAT_CRC32 Enable Armv8.0-A CRC-32 checksum instructions +// CHECK-NEXT: FEAT_CSV2_2 Enable architectural speculation restriction +// CHECK-NEXT: FEAT_DIT Enable Armv8.4-A Data Independent Timing instructions +// CHECK-NEXT: FEAT_DPB Enable Armv8.2-A data Cache Clean to Point of Persistence +// CHECK-NEXT: FEAT_DPB2 Enable Armv8.5-A Cache Clean to Point of Deep Persistence +// CHECK-NEXT: FEAT_DotProd Enable dot product support +// CHECK-NEXT: FEAT_ETE Enable Embedded Trace Extension +// CHECK-NEXT: FEAT_FCMA Enable Armv8.3-A Floating-point complex number support +// CHECK-NEXT: FEAT_FHM Enable FP16 FML instructions +// CHECK-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int +// CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions +// CHECK-NEXT: FEAT_FlagM2 Enable alternative NZCV format for floating point comparisons +// CHECK-NEXT: FEAT_I8MM Enable Matrix Multiply Int8 Extension +// CHECK-NEXT: FEAT_JSCVT Enable Armv8.3-A JavaScript FP conversion instructions +// CHECK-NEXT: FEAT_LOR Enable Armv8.1-A Limited Ordering Regions extension +// CHECK-NEXT: FEAT_LRCPC Enable support for RCPC extension +// CHECK-NEXT: FEAT_LRCPC2 Enable Armv8.4-A RCPC 
instructions with Immediate Offsets +// CHECK-NEXT: FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions +// CHECK-NEXT: FEAT_LSE2 Enable Armv8.4-A Large System Extension 2 (LSE2) atomicity rules +// CHECK-NEXT: FEAT_MPAM Enable Armv8.4-A Memory system Partitioning and Monitoring extension +// CHECK-NEXT: FEAT_MTE, FEAT_MTE2 Enable Memory Tagging Extension +// CHECK-NEXT: FEAT_NV, FEAT_NV2 Enable Armv8.4-A Nested Virtualization Enchancement +// CHECK-NEXT: FEAT_PAN Enable Armv8.1-A Privileged Access-Never extension +// CHECK-NEXT: FEAT_PAN2 Enable Armv8.2-A PAN s1e1R and s1e1W Variants +// CHECK-NEXT: FEAT_PAuth Enable Armv8.3-A Pointer Authentication extension +// CHECK-NEXT: FEAT_PMUv3 Enable Armv8.0-A PMUv3 Performance Monitors extension +// CHECK-NEXT: FEAT_RAS, FEAT_RASv1p1 Enable Armv8.0-A Reliability, Availability and Serviceability Extensions +// CHECK-NEXT: FEAT_RDM Enable Armv8.1-A Rounding Double Multiply Add/Subtract instructions +// CHECK-NEXT: FEAT_RNG Enable Random Number generation instructions +// CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier +// CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension +// CHECK-NEXT: FEAT_SHA1, FEAT_SHA256 Enable SHA1 and SHA256 support +// CHECK-NEXT: FEAT_SHA3, FEAT_SHA512 Enable SHA512 and SHA3 support +// CHECK-NEXT: FEAT_SM4, FEAT_SM3 Enable SM3 and SM4 support +// CHECK-NEXT: FEAT_SPE Enable Statistical Profiling extension +// CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions +// CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit +// CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions +// CHECK-NEXT: FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable SVE AES and quadword SVE polynomial multiply instructions +// CHECK-NEXT: FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SM4 Enable SM4 SVE2 instructions +// CHECK-NEXT: FEAT_TLBIOS, FEAT_TLBIRANGE Enable Armv8.4-A TLB Range and Maintenance instructions +// CHECK-NEXT: FEAT_TRBE Enable Trace Buffer Extension +// CHECK-NEXT: FEAT_TRF Enable Armv8.4-A Trace extension +// CHECK-NEXT: FEAT_UAO Enable Armv8.2-A UAO PState +// CHECK-NEXT: FEAT_VHE Enable Armv8.1-A Virtual Host extension \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index b977b6aaaf619..30d9372e4afd1 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -1059,7 +1059,6 @@ def ProcessorFeatures { FeatureJS, FeatureLSE, FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureCCIDX, FeatureRDM]; - list Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, FeatureRandGen, FeaturePAuth, FeatureSM4, FeatureSHA2, @@ -1068,6 +1067,7 @@ def ProcessorFeatures { FeatureDotProd, FeatureFPARMv8, FeatureMatMulInt8, FeatureSSBS, FeatureCCIDX, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; + list Grace = !listconcat(NeoverseV2, [FeatureSVE2SM4, FeatureSVEAES, FeatureSVE2SHA3]); // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not @@ -1151,6 +1151,8 @@ def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4, [TuneX4]>; def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925, [TuneX925]>; +def : ProcessorModel<"grace", NeoverseV2Model, ProcessorFeatures.Grace, + [TuneNeoverseV2]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>; def : ProcessorModel<"neoverse-n1", NeoverseN1Model, @@ -1166,7 +1168,6 @@ def : ProcessorModel<"neoverse-v1", NeoverseV1Model, ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>; def : ProcessorModel<"neoverse-v2", NeoverseV2Model, ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>; -def : ProcessorAlias<"grace", "neoverse-v2">; def : ProcessorModel<"neoverse-v3", NeoverseV2Model, ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>; def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model, diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index e9722f348f411..363f0a0598e23 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll +++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -18,6 +18,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a77 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a78 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-x1 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=grace 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-e1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-n1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-n2 2>&1 | FileCheck %s diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 93ac7381b02ef..5d771a1a153f7 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1167,6 +1167,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("a64fx", "armv8.2-a"), AArch64CPUTestParams("fujitsu-monaka", "armv9.3-a"), AArch64CPUTestParams("carmel", "armv8.2-a"), + AArch64CPUTestParams("grace", "armv9-a"), AArch64CPUTestParams("saphira", "armv8.4-a"), AArch64CPUTestParams("oryon-1", "armv8.6-a")), AArch64CPUTestParams::PrintToStringParamName); @@ -1247,7 +1248,6 @@ TEST_P(AArch64CPUAliasTestFixture, testCPUAlias) { INSTANTIATE_TEST_SUITE_P( AArch64CPUAliasTests, AArch64CPUAliasTestFixture, ::testing::Values(AArch64CPUAliasTestParams({"neoverse-n2", "cobalt-100"}), - AArch64CPUAliasTestParams({"neoverse-v2", "grace"}), AArch64CPUAliasTestParams({"apple-a7", "cyclone", "apple-a8", "apple-a9"}), AArch64CPUAliasTestParams({"apple-a12", "apple-s4", From fadbc33b01d6815bf05d802d1323322262b54d42 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 19 Feb 2025 12:58:55 +0100 Subject: [PATCH 046/220] [lldb] Add LineTable::{upper,lower}_bound (#127519) The motivation is #123622 and the fact that it is hard to find the last line entry in a given range. `FindLineEntryByAddress(range_end-1)` is the best we have, but it's not ideal because it has a magic -1 and because it relies on there existing a line entry at that address (generally, it should be there, but if for some reason it isn't, we might end up ignoring the entries that are there, or -- like my incorrect fix in #123622 did -- iterating through the entire line table). What we really want is to get the last entry that exists in the given range.
Or, equivalently (and more STL-like) the first entry after that range. This is what these functions do. I've used the STL names since they do pretty much exactly what the standard functions do (the main head-scratcher comes from the fact that our entries represent ranges rather than single values). The functions can also be used to simplify the maze of `if` statements in `FindLineEntryByAddress`, but I'm keeping that as a separate patch. For now, I'm just adding some unit testing for that function to gain more confidence that the patch does not change the function behavior. --------- Co-authored-by: Jonas Devlieghere --- lldb/include/lldb/Symbol/LineTable.h | 13 ++ lldb/source/Symbol/LineTable.cpp | 46 +++- lldb/unittests/Symbol/CMakeLists.txt | 1 + lldb/unittests/Symbol/LineTableTest.cpp | 285 ++++++++++++++++++++++++ 4 files changed, 343 insertions(+), 2 deletions(-) create mode 100644 lldb/unittests/Symbol/LineTableTest.cpp diff --git a/lldb/include/lldb/Symbol/LineTable.h b/lldb/include/lldb/Symbol/LineTable.h index 6d158ab518879..f66081b6ee110 100644 --- a/lldb/include/lldb/Symbol/LineTable.h +++ b/lldb/include/lldb/Symbol/LineTable.h @@ -102,6 +102,19 @@ class LineTable { void GetDescription(Stream *s, Target *target, lldb::DescriptionLevel level); + /// Helper function for line table iteration. \c lower_bound returns the index + /// of the first line entry which ends after the given address (i.e., the + /// first entry which contains the given address or it comes after it). + /// \c upper_bound returns the index of the first line entry which begins on + /// or after the given address (i.e., the entry which would come after the + /// entry containing the given address, if such an entry exists). Functions + /// return GetSize() if there is no such entry. The functions are + /// most useful in combination: iterating from lower_bound(a) to + /// upper_bound(b) returns all line tables which intersect the half-open + /// range [a,b). + uint32_t lower_bound(const Address &so_addr) const; + uint32_t upper_bound(const Address &so_addr) const; + /// Find a line entry that contains the section offset address \a so_addr. /// /// \param[in] so_addr diff --git a/lldb/source/Symbol/LineTable.cpp b/lldb/source/Symbol/LineTable.cpp index 3d2afcdd11997..aae4ab59ff156 100644 --- a/lldb/source/Symbol/LineTable.cpp +++ b/lldb/source/Symbol/LineTable.cpp @@ -123,7 +123,7 @@ void LineTable::InsertSequence(LineSequence *sequence) { entry_collection::iterator end_pos = m_entries.end(); LineTable::Entry::LessThanBinaryPredicate less_than_bp(this); entry_collection::iterator pos = - upper_bound(begin_pos, end_pos, entry, less_than_bp); + std::upper_bound(begin_pos, end_pos, entry, less_than_bp); // We should never insert a sequence in the middle of another sequence if (pos != begin_pos) { @@ -185,6 +185,48 @@ bool LineTable::GetLineEntryAtIndex(uint32_t idx, LineEntry &line_entry) { return false; } +uint32_t LineTable::lower_bound(const Address &so_addr) const { + if (so_addr.GetModule() != m_comp_unit->GetModule()) + return GetSize(); + + Entry search_entry; + search_entry.file_addr = so_addr.GetFileAddress(); + if (search_entry.file_addr == LLDB_INVALID_ADDRESS) + return GetSize(); + + // This is not a typo. upper_bound returns the first entry which definitely + // does not contain this address, which means the entry before it *might* + // contain it -- if it is not a termination entry. 
+ auto pos = + llvm::upper_bound(m_entries, search_entry, Entry::EntryAddressLessThan); + + if (pos != m_entries.begin() && !std::prev(pos)->is_terminal_entry) + --pos; + + return std::distance(m_entries.begin(), pos); +} + +uint32_t LineTable::upper_bound(const Address &so_addr) const { + if (so_addr.GetModule() != m_comp_unit->GetModule()) + return GetSize(); + + Entry search_entry; + search_entry.file_addr = so_addr.GetFileAddress(); + if (search_entry.file_addr == LLDB_INVALID_ADDRESS) + return GetSize(); + + // This is not a typo. lower_bound returns the first entry which starts on or + // after the given address, which is exactly what we want -- *except* if the + // entry is a termination entry (in that case, we want the one after it). + auto pos = + llvm::lower_bound(m_entries, search_entry, Entry::EntryAddressLessThan); + if (pos != m_entries.end() && pos->file_addr == search_entry.file_addr && + pos->is_terminal_entry) + ++pos; + + return std::distance(m_entries.begin(), pos); +} + bool LineTable::FindLineEntryByAddress(const Address &so_addr, LineEntry &line_entry, uint32_t *index_ptr) { @@ -199,7 +241,7 @@ bool LineTable::FindLineEntryByAddress(const Address &so_addr, if (search_entry.file_addr != LLDB_INVALID_ADDRESS) { entry_collection::const_iterator begin_pos = m_entries.begin(); entry_collection::const_iterator end_pos = m_entries.end(); - entry_collection::const_iterator pos = lower_bound( + entry_collection::const_iterator pos = std::lower_bound( begin_pos, end_pos, search_entry, Entry::EntryAddressLessThan); if (pos != end_pos) { if (pos != begin_pos) { diff --git a/lldb/unittests/Symbol/CMakeLists.txt b/lldb/unittests/Symbol/CMakeLists.txt index e1d24357e33db..ab5cecd101833 100644 --- a/lldb/unittests/Symbol/CMakeLists.txt +++ b/lldb/unittests/Symbol/CMakeLists.txt @@ -1,5 +1,6 @@ add_lldb_unittest(SymbolTests JSONSymbolTest.cpp + LineTableTest.cpp LocateSymbolFileTest.cpp MangledTest.cpp PostfixExpressionTest.cpp diff --git a/lldb/unittests/Symbol/LineTableTest.cpp b/lldb/unittests/Symbol/LineTableTest.cpp new file mode 100644 index 0000000000000..2fa2913f67f9e --- /dev/null +++ b/lldb/unittests/Symbol/LineTableTest.cpp @@ -0,0 +1,285 @@ +//===-- LineTableTest.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Plugins/ObjectFile/ELF/ObjectFileELF.h" +#include "TestingSupport/SubsystemRAII.h" +#include "TestingSupport/TestUtilities.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Symbol/CompileUnit.h" +#include "lldb/Symbol/SymbolFile.h" +#include "gtest/gtest.h" +#include + +using namespace lldb; +using namespace llvm; +using namespace lldb_private; + +namespace { + +// A fake symbol file class to allow us to create the line table "the right +// way". Pretty much all methods except for GetCompileUnitAtIndex and +// GetNumCompileUnits are stubbed out. +class FakeSymbolFile : public SymbolFile { +public: + /// LLVM RTTI support. 
+ /// \{ + bool isA(const void *ClassID) const override { + return ClassID == &ID || SymbolFile::isA(ClassID); + } + static bool classof(const SymbolFile *obj) { return obj->isA(&ID); } + /// \} + + static void Initialize() { + PluginManager::RegisterPlugin("FakeSymbolFile", "", CreateInstance, + DebuggerInitialize); + } + static void Terminate() { PluginManager::UnregisterPlugin(CreateInstance); } + + void InjectCompileUnit(std::unique_ptr cu_up) { + m_cu_sp = std::move(cu_up); + } + +private: + /// LLVM RTTI support. + static char ID; + + static SymbolFile *CreateInstance(ObjectFileSP objfile_sp) { + return new FakeSymbolFile(std::move(objfile_sp)); + } + static void DebuggerInitialize(Debugger &) {} + + StringRef GetPluginName() override { return "FakeSymbolFile"; } + uint32_t GetAbilities() override { return UINT32_MAX; } + uint32_t CalculateAbilities() override { return UINT32_MAX; } + uint32_t GetNumCompileUnits() override { return 1; } + CompUnitSP GetCompileUnitAtIndex(uint32_t) override { return m_cu_sp; } + Symtab *GetSymtab() override { return nullptr; } + LanguageType ParseLanguage(CompileUnit &) override { return eLanguageTypeC; } + size_t ParseFunctions(CompileUnit &) override { return 0; } + bool ParseLineTable(CompileUnit &) override { return true; } + bool ParseDebugMacros(CompileUnit &) override { return true; } + bool ParseSupportFiles(CompileUnit &, SupportFileList &) override { + return true; + } + size_t ParseTypes(CompileUnit &) override { return 0; } + bool ParseImportedModules(const SymbolContext &, + std::vector &) override { + return false; + } + size_t ParseBlocksRecursive(Function &) override { return 0; } + size_t ParseVariablesForContext(const SymbolContext &) override { return 0; } + Type *ResolveTypeUID(user_id_t) override { return nullptr; } + std::optional + GetDynamicArrayInfoForUID(user_id_t, const ExecutionContext *) override { + return std::nullopt; + } + bool CompleteType(CompilerType &) override { return true; } + uint32_t ResolveSymbolContext(const Address &, SymbolContextItem, + SymbolContext &) override { + return 0; + } + void GetTypes(SymbolContextScope *, TypeClass, TypeList &) override {} + Expected GetTypeSystemForLanguage(LanguageType) override { + return createStringError(std::errc::not_supported, ""); + } + const ObjectFile *GetObjectFile() const override { + return m_objfile_sp.get(); + } + ObjectFile *GetObjectFile() override { return m_objfile_sp.get(); } + ObjectFile *GetMainObjectFile() override { return m_objfile_sp.get(); } + void SectionFileAddressesChanged() override {} + void Dump(Stream &) override {} + uint64_t GetDebugInfoSize(bool) override { return 0; } + bool GetDebugInfoIndexWasLoadedFromCache() const override { return false; } + void SetDebugInfoIndexWasLoadedFromCache() override {} + bool GetDebugInfoIndexWasSavedToCache() const override { return false; } + void SetDebugInfoIndexWasSavedToCache() override {} + bool GetDebugInfoHadFrameVariableErrors() const override { return false; } + void SetDebugInfoHadFrameVariableErrors() override {} + TypeSP MakeType(user_id_t, ConstString, std::optional, + SymbolContextScope *, user_id_t, Type::EncodingDataType, + const Declaration &, const CompilerType &, Type::ResolveState, + uint32_t) override { + return nullptr; + } + TypeSP CopyType(const TypeSP &) override { return nullptr; } + + FakeSymbolFile(ObjectFileSP objfile_sp) + : m_objfile_sp(std::move(objfile_sp)) {} + + ObjectFileSP m_objfile_sp; + CompUnitSP m_cu_sp; +}; + +struct FakeModuleFixture { + TestFile file; + ModuleSP 
module_sp; + SectionSP text_sp; + LineTable *line_table; +}; + +class LineTableTest : public testing::Test { + SubsystemRAII subsystems; +}; + +class LineSequenceBuilder { +public: + std::vector> Build() { + return std::move(m_sequences); + } + enum Terminal : bool { Terminal = true }; + void Entry(addr_t addr, bool terminal = false) { + LineTable::AppendLineEntryToSequence( + m_seq_up.get(), addr, /*line=*/1, /*column=*/0, + /*file_idx=*/0, + /*is_start_of_statement=*/false, /*is_start_of_basic_block=*/false, + /*is_prologue_end=*/false, /*is_epilogue_begin=*/false, terminal); + if (terminal) { + m_sequences.push_back(std::move(m_seq_up)); + m_seq_up = LineTable::CreateLineSequenceContainer(); + } + } + +private: + std::vector> m_sequences; + std::unique_ptr m_seq_up = + LineTable::CreateLineSequenceContainer(); +}; + +} // namespace + +char FakeSymbolFile::ID; + +static llvm::Expected +CreateFakeModule(std::vector> line_sequences) { + Expected file = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_386 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0010 + Address: 0x0000 + Size: 0x1000 +)"); + if (!file) + return file.takeError(); + + auto module_sp = std::make_shared(file->moduleSpec()); + SectionSP text_sp = + module_sp->GetSectionList()->FindSectionByName(ConstString(".text")); + if (!text_sp) + return createStringError("No .text"); + + auto cu_up = std::make_unique(module_sp, /*user_data=*/nullptr, + /*support_file_sp=*/nullptr, + /*uid=*/0, eLanguageTypeC, + /*is_optimized=*/eLazyBoolNo); + LineTable *line_table = new LineTable(cu_up.get(), std::move(line_sequences)); + cu_up->SetLineTable(line_table); + cast(module_sp->GetSymbolFile()) + ->InjectCompileUnit(std::move(cu_up)); + + return FakeModuleFixture{std::move(*file), std::move(module_sp), + std::move(text_sp), line_table}; +} + +TEST_F(LineTableTest, LowerAndUpperBound) { + LineSequenceBuilder builder; + builder.Entry(0); + builder.Entry(10); + builder.Entry(20, LineSequenceBuilder::Terminal); + builder.Entry(20); // Starts right after the previous sequence. + builder.Entry(30, LineSequenceBuilder::Terminal); + builder.Entry(40); // Gap after the previous sequence. + builder.Entry(50, LineSequenceBuilder::Terminal); + + llvm::Expected fixture = CreateFakeModule(builder.Build()); + ASSERT_THAT_EXPECTED(fixture, llvm::Succeeded()); + + LineTable *table = fixture->line_table; + + auto make_addr = [&](addr_t addr) { return Address(fixture->text_sp, addr); }; + + // Both functions return the same value for boundary values. This way the + // index range for e.g. [0,10) is [0,1). + EXPECT_EQ(table->lower_bound(make_addr(0)), 0u); + EXPECT_EQ(table->upper_bound(make_addr(0)), 0u); + EXPECT_EQ(table->lower_bound(make_addr(10)), 1u); + EXPECT_EQ(table->upper_bound(make_addr(10)), 1u); + EXPECT_EQ(table->lower_bound(make_addr(20)), 3u); + EXPECT_EQ(table->upper_bound(make_addr(20)), 3u); + + // In case there's no "real" entry at this address, they return the first real + // entry. + EXPECT_EQ(table->lower_bound(make_addr(30)), 5u); + EXPECT_EQ(table->upper_bound(make_addr(30)), 5u); + + EXPECT_EQ(table->lower_bound(make_addr(40)), 5u); + EXPECT_EQ(table->upper_bound(make_addr(40)), 5u); + + // For in-between values, their result differs by one. [9,19) maps to [0,2) + // because the first two entries contain a part of that range. 
+ EXPECT_EQ(table->lower_bound(make_addr(9)), 0u); + EXPECT_EQ(table->upper_bound(make_addr(9)), 1u); + EXPECT_EQ(table->lower_bound(make_addr(19)), 1u); + EXPECT_EQ(table->upper_bound(make_addr(19)), 2u); + EXPECT_EQ(table->lower_bound(make_addr(29)), 3u); + EXPECT_EQ(table->upper_bound(make_addr(29)), 4u); + + // In a gap, they both return the first entry after the gap. + EXPECT_EQ(table->lower_bound(make_addr(39)), 5u); + EXPECT_EQ(table->upper_bound(make_addr(39)), 5u); + + // And if there's no such entry, they return the size of the list. + EXPECT_EQ(table->lower_bound(make_addr(50)), table->GetSize()); + EXPECT_EQ(table->upper_bound(make_addr(50)), table->GetSize()); + EXPECT_EQ(table->lower_bound(make_addr(59)), table->GetSize()); + EXPECT_EQ(table->upper_bound(make_addr(59)), table->GetSize()); +} + +TEST_F(LineTableTest, FindLineEntryByAddress) { + LineSequenceBuilder builder; + builder.Entry(0); + builder.Entry(10); + builder.Entry(20, LineSequenceBuilder::Terminal); + builder.Entry(20); // Starts right after the previous sequence. + builder.Entry(30, LineSequenceBuilder::Terminal); + builder.Entry(40); // Gap after the previous sequence. + builder.Entry(50, LineSequenceBuilder::Terminal); + + llvm::Expected<FakeModuleFixture> fixture = CreateFakeModule(builder.Build()); + ASSERT_THAT_EXPECTED(fixture, llvm::Succeeded()); + + LineTable *table = fixture->line_table; + + auto find = [&](addr_t addr) -> std::tuple<addr_t, addr_t, bool> { + LineEntry entry; + if (!table->FindLineEntryByAddress(Address(fixture->text_sp, addr), entry)) + return {LLDB_INVALID_ADDRESS, LLDB_INVALID_ADDRESS, false}; + return {entry.range.GetBaseAddress().GetFileAddress(), + entry.range.GetByteSize(), + static_cast<bool>(entry.is_terminal_entry)}; + }; + + EXPECT_THAT(find(0), testing::FieldsAre(0, 10, false)); + EXPECT_THAT(find(9), testing::FieldsAre(0, 10, false)); + EXPECT_THAT(find(10), testing::FieldsAre(10, 10, false)); + EXPECT_THAT(find(19), testing::FieldsAre(10, 10, false)); + EXPECT_THAT(find(20), testing::FieldsAre(20, 10, false)); + EXPECT_THAT(find(30), testing::FieldsAre(LLDB_INVALID_ADDRESS, + LLDB_INVALID_ADDRESS, false)); + EXPECT_THAT(find(40), testing::FieldsAre(40, 10, false)); + EXPECT_THAT(find(50), testing::FieldsAre(LLDB_INVALID_ADDRESS, + LLDB_INVALID_ADDRESS, false)); +} From c0a763d3ef26148af06911eda059eec303adfa39 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 19 Feb 2025 09:11:38 -0300 Subject: [PATCH 047/220] [NFC][MLIR] Make file-local cl::opt global variables static (#126714) This is per the style guide: make file-scope symbols static whenever possible. Fix #125983.
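For reference, the mechanical pattern applied to the file-scope options below is simply the following (a generic sketch modeled on the `selectedDialect` option from DialectGen.cpp; the `std::string` template argument is an assumption for illustration). For the options declared inside `main()` in mlir-rewrite.cpp the change goes the other way: the now-unneeded function-local `static` is dropped instead.

    #include "llvm/Support/CommandLine.h"

    // Before: a file-scope cl::opt with external linkage, visible outside this
    // translation unit:
    //   llvm::cl::opt<std::string> selectedDialect("dialect", ...);
    //
    // After: marking it 'static' gives the variable internal linkage, which is
    // what the LLVM coding standards ask for when a symbol is file-local.
    static llvm::cl::opt<std::string>
        selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"));

    int main(int argc, char **argv) {
      llvm::cl::ParseCommandLineOptions(argc, argv);
      return 0;
    }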
--- mlir/tools/mlir-rewrite/mlir-rewrite.cpp | 8 ++++---- mlir/tools/mlir-runner/mlir-runner.cpp | 2 +- mlir/tools/mlir-tblgen/DialectGen.cpp | 2 +- mlir/tools/mlir-tblgen/OpDocGen.cpp | 4 ++-- mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp index bbb6bd6617a13..87df9e19d1842 100644 --- a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp +++ b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp @@ -348,11 +348,11 @@ static mlir::RewriterRegistration rewriteMarkRanges("mark-ranges", "Indicate ranges parsed", markRanges); int main(int argc, char **argv) { - static llvm::cl::opt inputFilename( - llvm::cl::Positional, llvm::cl::desc(""), - llvm::cl::init("-")); + llvm::cl::opt inputFilename(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); - static llvm::cl::opt outputFilename( + llvm::cl::opt outputFilename( "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), llvm::cl::init("-")); diff --git a/mlir/tools/mlir-runner/mlir-runner.cpp b/mlir/tools/mlir-runner/mlir-runner.cpp index 7e8793de03ead..932c9f6cc9fdc 100644 --- a/mlir/tools/mlir-runner/mlir-runner.cpp +++ b/mlir/tools/mlir-runner/mlir-runner.cpp @@ -32,7 +32,7 @@ using namespace mlir; // TODO: Consider removing this linking functionality from the SPIR-V CPU Runner // flow in favour of a more proper host/device split like other runners. // https://github.com/llvm/llvm-project/issues/115348 -llvm::cl::opt LinkNestedModules( +static llvm::cl::opt LinkNestedModules( "link-nested-modules", llvm::cl::desc("Link two nested MLIR modules into a single LLVM IR module. " "Useful if both the host and device code can be run on the " diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp index 414cad5e1dcc2..6cf71d2bb0174 100644 --- a/mlir/tools/mlir-tblgen/DialectGen.cpp +++ b/mlir/tools/mlir-tblgen/DialectGen.cpp @@ -34,7 +34,7 @@ using llvm::Record; using llvm::RecordKeeper; static llvm::cl::OptionCategory dialectGenCat("Options for -gen-dialect-*"); -llvm::cl::opt +static llvm::cl::opt selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"), llvm::cl::cat(dialectGenCat), llvm::cl::CommaSeparated); diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 43d406e4340f7..dbaad84cda5d6 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -44,11 +44,11 @@ using mlir::tblgen::Operator; //===----------------------------------------------------------------------===// static cl::OptionCategory docCat("Options for -gen-(attrdef|typedef|enum|op|dialect)-doc"); -cl::opt +static cl::opt stripPrefix("strip-prefix", cl::desc("Strip prefix of the fully qualified names"), cl::init("::mlir::"), cl::cat(docCat)); -cl::opt allowHugoSpecificFeatures( +static cl::opt allowHugoSpecificFeatures( "allow-hugo-specific-features", cl::desc("Allows using features specific to Hugo"), cl::init(false), cl::cat(docCat)); diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index c9f6dd35de44e..c2ad09ffaaed5 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -36,7 +36,7 @@ using namespace mlir; using tblgen::NamedTypeConstraint; static llvm::cl::OptionCategory dialectGenCat("Options for -gen-irdl-dialect"); -llvm::cl::opt +static llvm::cl::opt selectedDialect("dialect", 
llvm::cl::desc("The dialect to gen for"), llvm::cl::cat(dialectGenCat), llvm::cl::Required); From 0f472e93d504fc03a4fcdbd898ab9c7973eeca78 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 19 Feb 2025 19:14:18 +0700 Subject: [PATCH 048/220] AMDGPU: Avoid double attribute lookup for register count attributes (#127782) --- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index b5e8e246825c7..55af5826e90d0 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -422,10 +422,10 @@ unsigned GCNSubtarget::getBaseMaxNumSGPRs( // Check if maximum number of SGPRs was explicitly requested using // "amdgpu-num-sgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-sgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); + if (Requested != MaxNumSGPRs) { // Make sure requested value does not violate subtarget's specifications. if (Requested && (Requested <= ReservedNumSGPRs)) Requested = 0; @@ -504,10 +504,9 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs( // Check if maximum number of VGPRs was explicitly requested using // "amdgpu-num-vgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-vgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); - + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); + if (Requested != MaxNumVGPRs) { if (hasGFX90AInsts()) Requested *= 2; From 73d067977b1061689c88621cfc65e024b6d6a08d Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 19 Feb 2025 12:21:18 +0000 Subject: [PATCH 049/220] [libclc] Clean up directory search procedure (#127783) During a recent change, the build system accidentally dropped the (theoretical) support for the CLC builtins library to build target-specific builtins from the 'amdgpu' directory, due to a change in variable names. This functionality wasn't being used but was spotted during another code review. This commit takes the opportunity to clean up and better document the code that manages the list of directories to search for builtin implementations. While fixing this, some references to now-removed SOURCES files were discovered which have been cleaned up. 
--- libclc/CMakeLists.txt | 32 +++++++++++++++++----------- libclc/cmake/modules/AddLibclc.cmake | 11 +++++----- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 05a2b87a56bc4..5a9a26c44f368 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -20,16 +20,12 @@ include( GNUInstallDirs ) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS amdgcn-amdhsa/lib/SOURCES; amdgcn/lib/SOURCES; - amdgcn-mesa3d/lib/SOURCES; amdgpu/lib/SOURCES; clspv/lib/SOURCES; - clspv64/lib/SOURCES; generic/lib/SOURCES; - ptx/lib/SOURCES; ptx-nvidiacl/lib/SOURCES; r600/lib/SOURCES; spirv/lib/SOURCES; - spirv64/lib/SOURCES; # CLC internal libraries clc/lib/generic/SOURCES; ) @@ -280,11 +276,6 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( opencl_dirs ) - if ( NOT ${ARCH} STREQUAL spirv AND NOT ${ARCH} STREQUAL spirv64 AND - NOT ${ARCH} STREQUAL clspv AND NOT ${ARCH} STREQUAL clspv64) - LIST( APPEND opencl_dirs generic ) - endif() - if( ${ARCH} STREQUAL r600 OR ${ARCH} STREQUAL amdgcn ) list( APPEND opencl_dirs amdgpu ) endif() @@ -302,8 +293,25 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( DARCH ${ARCH} ) endif() + # Append a variety of target- and triple-based directories to search, + # increasing in specificity. + list( APPEND opencl_dirs ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} ) + + # The 'generic' directory contains all of the generic implementations of the + # builtins. It is included first so it has the lowest search priority, + # allowing targets to override builtins based on file names found later in + # the list of search directories. + # CLC builds all builtins for all targets, so unconditionally prepend the + # 'generic' directory. + set( clc_dirs generic ${opencl_dirs} ) + # Some OpenCL targets don't build all builtins, in which case they don't want + # the 'generic' directory. Otherwise, prepend the 'generic' directory. + if ( NOT ARCH STREQUAL spirv AND NOT ARCH STREQUAL spirv64 AND + NOT ARCH STREQUAL clspv AND NOT ARCH STREQUAL clspv64) + list( PREPEND opencl_dirs generic ) + endif() + set( clc_lib_files ) - set( clc_dirs ${dirs} generic ) if( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) set( clc_gen_files clc-clspv-convert.cl ) @@ -315,7 +323,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) clc_lib_files CLC_INTERNAL LIB_ROOT_DIR clc - DIRS ${clc_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${clc_dirs} ) set( opencl_lib_files ) @@ -334,7 +342,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) libclc_configure_lib_source( opencl_lib_files - DIRS ${opencl_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${opencl_dirs} ) foreach( d ${${t}_devices} ) diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 40e31e0ba4f45..911559ff4bfa9 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -402,7 +402,8 @@ endfunction(add_libclc_builtin_set) # directory. If not provided, is set to '.'. # * DIRS ... # List of directories under LIB_ROOT_DIR to walk over searching for SOURCES -# files +# files. Directories earlier in the list have lower priority than +# subsequent ones. 
function(libclc_configure_lib_source LIB_FILE_LIST) cmake_parse_arguments(ARG "CLC_INTERNAL" @@ -417,7 +418,7 @@ function(libclc_configure_lib_source LIB_FILE_LIST) # Enumerate SOURCES* files set( source_list ) - foreach( l ${ARG_DIRS} ) + foreach( l IN LISTS ARG_DIRS ) foreach( s "SOURCES" "SOURCES_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}" ) if( ARG_CLC_INTERNAL ) file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/lib/${l}/${s} file_loc ) @@ -425,10 +426,10 @@ function(libclc_configure_lib_source LIB_FILE_LIST) file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/${l}/lib/${s} file_loc ) endif() file( TO_CMAKE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${file_loc} loc ) - # Prepend the location to give higher priority to - # specialized implementation + # Prepend the location to give higher priority to the specialized + # implementation if( EXISTS ${loc} ) - set( source_list ${file_loc} ${source_list} ) + list( PREPEND source_list ${file_loc} ) endif() endforeach() endforeach() From 3ce2e4df5d45ec90ac544c32fa63042be2151d04 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Wed, 19 Feb 2025 17:55:25 +0530 Subject: [PATCH 050/220] [NVPTX] Add tcgen05.cp/shift intrinsics (#127669) This patch adds intrinsics for tcgen05.cp and tcgen05.shift instructions. lit tests are added and verified with a ptxas-12.8 executable. Docs are updated in the NVPTXUsage.rst file. Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 87 ++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 32 +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 42 +++ llvm/test/CodeGen/NVPTX/tcgen05-cp.ll | 348 +++++++++++++++++++++++ llvm/test/CodeGen/NVPTX/tcgen05-shift.ll | 23 ++ 5 files changed, 532 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-cp.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-shift.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 8550af456e961..675b458c41e7b 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1183,6 +1183,93 @@ operations. For more information, refer to the PTX ISA ``_. +'``llvm.nvvm.tcgen05.shift``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.shift.{cg1/cg2}``' intrinsics correspond to +the ``tcgen05.shift.{cg1/cg2}`` PTX instructions. The ``tcgen05.shift`` +is an asynchronous instruction which initiates the shifting of 32-byte +elements downwards across all the rows, except the last, by one row. +The address operand ``%tmem_addr`` specifies the base address of the +matrix in the Tensor Memory whose rows must be down shifted. + +For more information, refer to the PTX ISA +``_. + +'``llvm.nvvm.tcgen05.cp``' +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.tcgen05.cp.4x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.cp.{shape}.{src_fmt}.{cg1/cg2}``' intrinsics +correspond to the ``tcgen05.cp.*`` family of PTX instructions. +The ``tcgen05.cp`` instruction initiates an asynchronous copy operation from +shared memory to the location specified by ``%tmem_addr`` in Tensor Memory. +The 64-bit register operand ``%sdesc`` is the matrix descriptor representing +the source matrix in shared memory that needs to be copied. + +The valid shapes for the copy operation are: +{128x256b, 4x256b, 128x128b, 64x128b_warpx2_02_13, 64x128b_warpx2_01_23, 32x128b_warpx4}. + +Shapes ``64x128b`` and ``32x128b`` require dedicated multicast qualifiers, +which are appended to the corresponding intrinsic names. + +Optionally, the data can be decompressed from the source format in the shared memory +to the destination format in Tensor Memory during the copy operation. Currently, +only ``.b8x16`` is supported as destination format. The valid source formats are +``.b6x16_p32`` and ``.b4x16_p64``. + +When the source format is ``.b6x16_p32``, a contiguous set of 16 elements of 6-bits +each followed by four bytes of padding (``_p32``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. + +When the source format is ``.b4x16_p64``, a contiguous set of 16 elements of 4-bits +each followed by eight bytes of padding (``_p64``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. + +For more information on the decompression schemes, refer to the PTX ISA +``_. 
+ +For more information on the tcgen05.cp instruction, refer to the PTX ISA +``_. Other Intrinsics ---------------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7ef270f3256a6..c32bf0318b5d6 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -55,6 +55,14 @@ def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr // MISC // +// Helper class that concatenates list elements with +// a given separator 'sep' and returns the result. +// Handles empty strings. +class StrJoin str_list> { + string ret = !foldl("", str_list, a, b, + !if(!eq(a, ""), b, !if(!eq(b, ""), a, !strconcat(a, sep, b)))); +} + // Helper class that represents a 'fragment' of an NVPTX *MMA instruction. // Geom: mnk. E.g. m8n32k16 // Frag: [a|b|c|d] ([x1|x2|x4] for ldmatrix) @@ -5140,6 +5148,11 @@ foreach cta_group = ["cg1", "cg2"] in { [llvm_shared_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>]>; + + def int_nvvm_tcgen05_shift_down_ # cta_group : Intrinsic<[], + [llvm_tmem_ptr_ty], // tmem_addr + [IntrConvergent, IntrArgMemOnly, + NoCapture>]>; } // Tcgen05 wait_ld/st intrinsics @@ -5154,4 +5167,23 @@ def int_nvvm_tcgen05_fence_before_thread_sync : Intrinsic<[], [], def int_nvvm_tcgen05_fence_after_thread_sync : Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; +// Tcgen05 cp intrinsics +foreach cta_group = ["cg1", "cg2"] in { + foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { + foreach shape = ["128x256b", "4x256b", "128x128b", + "64x128b_warpx2_02_13", + "64x128b_warpx2_01_23", + "32x128b_warpx4"] in { + defvar intr_suffix = StrJoin<"_", [shape, src_fmt, cta_group]>.ret; + defvar name_suffix = StrJoin<".", [shape, src_fmt, cta_group]>.ret; + + def int_nvvm_tcgen05_cp_ # intr_suffix : Intrinsic<[], + [llvm_tmem_ptr_ty, // tmem_addr + llvm_i64_ty], // smem descriptor + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>], + "llvm.nvvm.tcgen05.cp." # name_suffix>; + } + } +} + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f20502521829e..ed7963f35a7c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7704,6 +7704,48 @@ defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR; defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR; defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR; +multiclass TCGEN05_SHIFT_INTR { + def NAME : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr), + !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"), + [(Intr Int32Regs:$tmem_addr)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; +defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; + +multiclass TCGEN05_CP_INTR { + defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16"); + defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret; + defvar fmt_intr = StrJoin<"_", [src_fmt]>.ret; + + defvar shape_mc_asm = StrJoin<".", [shape, mc]>.ret; + defvar shape_mc_intr = !subst("::", "_", !subst(".", "_", shape_mc_asm)); + + defvar intr_prefix = StrJoin<"_", ["int_nvvm_tcgen05_cp", shape_mc_intr, fmt_intr]>.ret; + defvar IntrCG1 = !cast(intr_prefix # "_cg1"); + defvar IntrCG2 = !cast(intr_prefix # "_cg2"); + + def NAME # _cg1 : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::1." 
# shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", + [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + Requires<[hasTcgen05Instructions]>; + def NAME # _cg2 : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", + [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + Requires<[hasTcgen05Instructions]>; +} + +foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { + defm TCGEN05_CP_128x256b # src_fmt : TCGEN05_CP_INTR<"128x256b", src_fmt>; + defm TCGEN05_CP_4x256b # src_fmt : TCGEN05_CP_INTR<"4x256b", src_fmt>; + defm TCGEN05_CP_128x128b # src_fmt : TCGEN05_CP_INTR<"128x128b", src_fmt>; + defm TCGEN05_CP_64x128_1 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::02_13">; + defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">; + defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">; +} } // isConvergent let hasSideEffects = 1 in { diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll new file mode 100644 index 0000000000000..50dc93325c286 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1 +define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2 +define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128 +define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_param_1]; +; 
CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + + +; CHECK-LABEL: test_tcgen05_cp_128x128b +define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x256b +define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b +define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b6x16_p32 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32 +define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32 +define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, 
i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32 +define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32 +define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32 +define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32 +define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32( +; CHECK: { +; 
CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b4x16_p64 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64 +define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64 +define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64 +define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64 +define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, 
[test_tcgen05_cp_64x128_v1_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64 +define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64 +define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll new file mode 100644 index 0000000000000..13a45b9d86dcf --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) +declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +; CHECK-LABEL: test_tcgen05_shift +define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) { +; CHECK-LABEL: test_tcgen05_shift( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_shift_param_0]; +; CHECK-NEXT: tcgen05.shift.cta_group::1.down [%r1]; +; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1]; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + call void @llvm.nvvm.tcgen05.shift.down.cg2(ptr 
addrspace(6) %tmem_addr) + + ret void +} From c6a907ac46869e3552f3a62eef08ba5548676d85 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Wed, 19 Feb 2025 13:28:42 +0100 Subject: [PATCH 051/220] [mlir][dlti] Fix query keys preallocation (#127786) Fixes upfront space allocation after #126716 --- mlir/lib/Dialect/DLTI/DLTI.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/DLTI/DLTI.cpp b/mlir/lib/Dialect/DLTI/DLTI.cpp index b057554c40d8c..70e05cb4cb383 100644 --- a/mlir/lib/Dialect/DLTI/DLTI.cpp +++ b/mlir/lib/Dialect/DLTI/DLTI.cpp @@ -571,7 +571,8 @@ FailureOr dlti::query(Operation *op, ArrayRef keys, return failure(); MLIRContext *ctx = op->getContext(); - SmallVector entryKeys(keys.size()); + SmallVector entryKeys; + entryKeys.reserve(keys.size()); for (StringRef key : keys) entryKeys.push_back(StringAttr::get(ctx, key)); From 2c8b1248513624e89b510397224f0f405116f3d3 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 19 Feb 2025 08:21:56 -0500 Subject: [PATCH 052/220] [libc++] Guard include of <features.h> with __has_include (#127691) Some configurations define __AMDGPU__ or __NVPTX__ on platforms that don't provide <features.h>, such as CUDA on Mac. --- libcxx/include/__configuration/platform.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h index cff99376ee24b..8d0f8f63f5213 100644 --- a/libcxx/include/__configuration/platform.h +++ b/libcxx/include/__configuration/platform.h @@ -32,12 +32,14 @@ // Need to detect which libc we're using if we're on Linux. #if defined(__linux__) || defined(__AMDGPU__) || defined(__NVPTX__) -# include <features.h> -# if defined(__GLIBC_PREREQ) -# define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) -# else -# define _LIBCPP_GLIBC_PREREQ(a, b) 0 -# endif // defined(__GLIBC_PREREQ) +# if __has_include(<features.h>) +# include <features.h> +# if defined(__GLIBC_PREREQ) +# define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) +# else +# define _LIBCPP_GLIBC_PREREQ(a, b) 0 +# endif // defined(__GLIBC_PREREQ) +# endif #endif #ifndef __BYTE_ORDER__ From 43e83b9405ec4eaa23919d24f0aaaefdd989b534 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 19 Feb 2025 14:29:52 +0100 Subject: [PATCH 053/220] [clang][bytecode] Fix allocating primitive arrays of unknown bound (#127788) --- clang/lib/AST/ByteCode/Compiler.cpp | 3 ++- clang/test/AST/ByteCode/new-delete.cpp | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 503c58a67adeb..a35aa9471a73d 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -3397,7 +3397,8 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { CtorFunc = getFunction(CE->getConstructor()); if (!CtorFunc) return false; - } + } else if (!DynamicInit) + DynamicInit = Init; LabelTy EndLabel = this->getLabel(); LabelTy StartLabel = this->getLabel(); diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index 7e5f6ab8815ea..5be1bb944c18c 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -907,6 +907,12 @@ namespace IncompleteArray { return c; } static_assert(test4() == 12); + + + constexpr char *f(int n) { + return new char[n](); + } + static_assert((delete[] f(2), true)); } namespace NonConstexprArrayCtor { From 7f69a399df384c86428d0c97e3afbc8146324226 Mon Sep 17 00:00:00 2001 From: "A. 
Jiang" Date: Wed, 19 Feb 2025 21:31:56 +0800 Subject: [PATCH 054/220] [libc++] Deprecate and remove member types of `hash` in `` (#127758) These member types were deprecated in C++17 by P0174R2 and removed in C++20 by P0619R4, but the changes in `` seem missing. Drive-by: Replace one `_NOEXCEPT` with `noexcept` as the `hash` specialization is C++17-and-later only. --- libcxx/include/__variant/monostate.h | 8 +++-- libcxx/include/variant | 8 +++-- .../variant/variant.hash/hash.depr.verify.cpp | 33 +++++++++++++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp diff --git a/libcxx/include/__variant/monostate.h b/libcxx/include/__variant/monostate.h index c5d2dacaf4205..b29bbdf5cdbe4 100644 --- a/libcxx/include/__variant/monostate.h +++ b/libcxx/include/__variant/monostate.h @@ -49,10 +49,12 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>=(monostate, monostate) noe template <> struct _LIBCPP_TEMPLATE_VIS hash { - using argument_type = monostate; - using result_type = size_t; +# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS) + using argument_type _LIBCPP_DEPRECATED_IN_CXX17 = monostate; + using result_type _LIBCPP_DEPRECATED_IN_CXX17 = size_t; +# endif - inline _LIBCPP_HIDE_FROM_ABI result_type operator()(const argument_type&) const _NOEXCEPT { + inline _LIBCPP_HIDE_FROM_ABI size_t operator()(const monostate&) const noexcept { return 66740831; // return a fundamentally attractive random value. } }; diff --git a/libcxx/include/variant b/libcxx/include/variant index 3786d9524020b..9998d4a457715 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -1585,10 +1585,12 @@ swap(variant<_Types...>& __lhs, template struct _LIBCPP_TEMPLATE_VIS hash< __enable_hash_helper, remove_const_t<_Types>...>> { - using argument_type = variant<_Types...>; - using result_type = size_t; +# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS) + using argument_type _LIBCPP_DEPRECATED_IN_CXX17 = variant<_Types...>; + using result_type _LIBCPP_DEPRECATED_IN_CXX17 = size_t; +# endif - _LIBCPP_HIDE_FROM_ABI result_type operator()(const argument_type& __v) const { + _LIBCPP_HIDE_FROM_ABI size_t operator()(const variant<_Types...>& __v) const { using __variant_detail::__visitation::__variant; size_t __res = __v.valueless_by_exception() diff --git a/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp b/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp new file mode 100644 index 0000000000000..9b22cbda9f345 --- /dev/null +++ b/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++17 + +#include + +#include "test_macros.h" + +using A1 [[maybe_unused]] = std::hash>::argument_type; +using R1 [[maybe_unused]] = std::hash>::result_type; +#if TEST_STD_VER >= 20 +// expected-error@-3 {{no type named 'argument_type' in 'std::hash>'}} +// expected-error@-3 {{no type named 'result_type' in 'std::hash>'}} +#else +// expected-warning@-6 {{'argument_type' is deprecated}} +// expected-warning@-6 {{'result_type' is deprecated}} +#endif + +using A2 [[maybe_unused]] = std::hash::argument_type; +using R2 [[maybe_unused]] = std::hash::result_type; +#if TEST_STD_VER >= 20 +// expected-error@-3 {{no type named 'argument_type' in 'std::hash'}} +// expected-error@-3 {{no type named 'result_type' in 'std::hash'}} +#else +// expected-warning@-6 {{'argument_type' is deprecated}} +// expected-warning@-6 {{'result_type' is deprecated}} +#endif From 1ac0db44fdb4de5fa7ec637c297bd0fc824057e2 Mon Sep 17 00:00:00 2001 From: zhijian lin Date: Wed, 19 Feb 2025 08:42:38 -0500 Subject: [PATCH 055/220] [NFC] using isUndef() instead of getOpcode() == ISD::UNDEF (#127713) [NFC] using isUndef() instead of getOpcode() == ISD::UNDEF --- llvm/include/llvm/CodeGen/SelectionDAG.h | 4 ++-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 ++++----- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 22 +++++++++---------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index cf8e4a3d2513b..aa0dfbe666cde 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -873,7 +873,7 @@ class SelectionDAG { /// for integers, a type wider than) VT's element type. SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op) { // VerifySDNode (via InsertNode) checks BUILD_VECTOR later. - if (Op.getOpcode() == ISD::UNDEF) { + if (Op.isUndef()) { assert((VT.getVectorElementType() == Op.getValueType() || (VT.isInteger() && VT.getVectorElementType().bitsLE(Op.getValueType()))) && @@ -889,7 +889,7 @@ class SelectionDAG { // Return a splat ISD::SPLAT_VECTOR node, consisting of Op splatted to all // elements. SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op) { - if (Op.getOpcode() == ISD::UNDEF) { + if (Op.isUndef()) { assert((VT.getVectorElementType() == Op.getValueType() || (VT.isInteger() && VT.getVectorElementType().bitsLE(Op.getValueType()))) && diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bc7cdf38dbc2a..f52447b86a7e4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16145,7 +16145,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // also recursively replace t184 by t150. SDValue MaybePoisonOperand = N->getOperand(0).getOperand(OpNo); // Don't replace every single UNDEF everywhere with frozen UNDEF, though. - if (MaybePoisonOperand.getOpcode() == ISD::UNDEF) + if (MaybePoisonOperand.isUndef()) continue; // First, freeze each offending operand. SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand); @@ -16173,7 +16173,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { SmallVector Ops(N0->ops()); // Special-handle ISD::UNDEF, each single one of them can be it's own thing. 
for (SDValue &Op : Ops) { - if (Op.getOpcode() == ISD::UNDEF) + if (Op.isUndef()) Op = DAG.getFreeze(Op); } @@ -24289,7 +24289,7 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { if (ISD::BITCAST == Op.getOpcode() && !Op.getOperand(0).getValueType().isVector()) Ops.push_back(Op.getOperand(0)); - else if (ISD::UNDEF == Op.getOpcode()) + else if (Op.isUndef()) Ops.push_back(DAG.getNode(ISD::UNDEF, DL, SVT)); else return SDValue(); @@ -24684,7 +24684,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) // -> (BUILD_VECTOR A, B, ..., C, D, ...) auto IsBuildVectorOrUndef = [](const SDValue &Op) { - return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode(); + return Op.isUndef() || ISD::BUILD_VECTOR == Op.getOpcode(); }; if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) { SmallVector Opnds; @@ -24708,7 +24708,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { EVT OpVT = Op.getValueType(); unsigned NumElts = OpVT.getVectorNumElements(); - if (ISD::UNDEF == Op.getOpcode()) + if (Op.isUndef()) Opnds.append(NumElts, DAG.getUNDEF(MinVT)); if (ISD::BUILD_VECTOR == Op.getOpcode()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 80c2de1d99542..de092cba333c2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6285,7 +6285,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, Flags.setNonNeg(N1->getFlags().hasNonNeg()); return getNode(OpOpcode, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); break; @@ -6305,7 +6305,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, Flags.setNonNeg(N1->getFlags().hasNonNeg()); return getNode(ISD::ZERO_EXTEND, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); @@ -6347,7 +6347,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) return getNode(OpOpcode, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // (ext (trunc x)) -> x @@ -6382,7 +6382,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0)); return N1.getOperand(0); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); if (OpOpcode == ISD::VSCALE && !NewNodesMustHaveLegalTypes) return getVScale(DL, VT, @@ -6400,14 +6400,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::ABS: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid ABS!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getConstant(0, DL, VT); break; case ISD::BSWAP: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BSWAP!"); assert((VT.getScalarSizeInBits() % 16 == 0) && "BSWAP types must be a multiple of 16 bits!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // bswap(bswap(X)) -> X. 
if (OpOpcode == ISD::BSWAP) @@ -6415,7 +6415,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::BITREVERSE: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BITREVERSE!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::BITCAST: @@ -6424,7 +6424,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == N1.getValueType()) return N1; // noop conversion. if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) return getNode(ISD::BITCAST, DL, VT, N1.getOperand(0)); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::SCALAR_TO_VECTOR: @@ -6434,7 +6434,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, N1.getValueType().isInteger() && VT.getVectorElementType().bitsLE(N1.getValueType()))) && "Illegal SCALAR_TO_VECTOR node!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined. if (OpOpcode == ISD::EXTRACT_VECTOR_ELT && @@ -6445,7 +6445,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::FNEG: // Negation of an unknown bag of bits is still completely undefined. - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); if (OpOpcode == ISD::FNEG) // --X -> X @@ -13364,7 +13364,7 @@ void BuildVectorSDNode::recastRawBits(bool IsLittleEndian, bool BuildVectorSDNode::isConstant() const { for (const SDValue &Op : op_values()) { unsigned Opc = Op.getOpcode(); - if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP) + if (!Op.isUndef() && Opc != ISD::Constant && Opc != ISD::ConstantFP) return false; } return true; From 01d0793a69ad4c5c54815138ebe945b5cdce2aca Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 19 Feb 2025 14:51:19 +0100 Subject: [PATCH 056/220] [LAA] Make Ptr argument optional in isNoWrap. (#127410) Update isNoWrap to make the IR Ptr argument optional. This allows using isNoWrap when dealing with things like pointer-selects, where a select is translated to multiple pointer SCEV expressions, but there is no IR value that can be used. We don't try to retrieve pointer values for the pointer SCEVs and using info from the IR would not be safe. For example, we cannot use inbounds, because the pointer may never be accessed. PR: https://github.com/llvm/llvm-project/pull/127410 --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 41 ++++++++++------- ...ter-dependence-analysis-forked-pointers.ll | 44 ++++++++++++++++++- 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 5a22ac8abc3fc..5dc5b025599b1 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -798,8 +798,13 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { // The access function must stride over the innermost loop. 
if (Lp != AR->getLoop()) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " - << *Ptr << " SCEV: " << *AR << "\n"); + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Not striding over innermost loop "; + if (Ptr) + dbgs() << *Ptr << " "; + + dbgs() << "SCEV: " << *AR << "\n"; + }); return std::nullopt; } @@ -809,8 +814,12 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, // Calculate the pointer stride and check if it is constant. const SCEVConstant *C = dyn_cast(Step); if (!C) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr - << " SCEV: " << *AR << "\n"); + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Not a constant strided "; + if (Ptr) + dbgs() << *Ptr << " "; + dbgs() << "SCEV: " << *AR << "\n"; + }); return std::nullopt; } @@ -837,8 +846,8 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, static bool isNoWrapGEP(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *L); -/// Check whether \p AR is a non-wrapping AddRec, or if \p Ptr is a non-wrapping -/// GEP. +/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use +/// informating from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, std::optional Stride = std::nullopt) { @@ -846,12 +855,12 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, if (AR->getNoWrapFlags(SCEV::NoWrapMask)) return true; - if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) + if (Ptr && PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) return true; // The address calculation must not wrap. Otherwise, a dependence could be // inverted. - if (isNoWrapGEP(Ptr, PSE, L)) + if (Ptr && isNoWrapGEP(Ptr, PSE, L)) return true; // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, @@ -859,7 +868,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // location will be larger than half the pointer index type space. In that // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. - if (auto *GEP = dyn_cast(Ptr); + if (auto *GEP = dyn_cast_if_present(Ptr); GEP && GEP->hasNoUnsignedSignedWrap()) return true; @@ -875,7 +884,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, return true; } - if (Assume) { + if (Ptr && Assume) { PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" << "LAA: Pointer: " << *Ptr << "\n" @@ -1117,6 +1126,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, SmallVector> TranslatedPtrs = findForkedPointer(PSE, StridesMap, Ptr, TheLoop); + assert(!TranslatedPtrs.empty() && "must have some translated pointers"); /// Check whether all pointers can participate in a runtime bounds check. They /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also @@ -1142,13 +1152,10 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. - if (ShouldCheckWrap) { - // Skip wrap checking when translating pointers. 
- if (TranslatedPtrs.size() > 1) - return false; - - if (!isNoWrap(PSE, AR, Ptr, AccessTy, TheLoop, Assume)) - return false; + if (ShouldCheckWrap && + !isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy, + TheLoop, Assume)) { + return false; } } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll index 5e9dc7f2b91cc..38b7389ae9083 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll @@ -83,10 +83,52 @@ exit: define void @dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs(ptr %a, ptr %b, ptr %c, i64 %offset, i64 %n) { ; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs' ; CHECK-NEXT: loop: -; CHECK-NEXT: Report: cannot check memory dependencies at runtime +; CHECK-NEXT: Memory dependences are safe with run-time checks ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP5:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP6:0x[0-9a-f]+]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP7:0x[0-9a-f]+]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP8:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 3: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP7]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Check 4: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP8]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 5: +; CHECK-NEXT: Comparing group ([[GRP7]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP8]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP5]]: +; CHECK-NEXT: (Low: %a High: ((4 * %n) + %a)) +; CHECK-NEXT: Member: {%a,+,4}<%loop> +; CHECK-NEXT: Group [[GRP6]]: +; CHECK-NEXT: (Low: %b High: ((4 * %n) + %b)) +; CHECK-NEXT: Member: {%b,+,4}<%loop> +; CHECK-NEXT: Group [[GRP7]]: +; CHECK-NEXT: (Low: %c High: ((4 * %n) + %c)) +; CHECK-NEXT: Member: {%c,+,4}<%loop> +; CHECK-NEXT: Group [[GRP8]]: +; CHECK-NEXT: (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a)) +; CHECK-NEXT: Member: {((4 * %offset) + %a),+,4}<%loop> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
; CHECK-NEXT: SCEV assumptions: From 760ec2c38e0cd01c016c403301e8dc081e0fc85c Mon Sep 17 00:00:00 2001 From: Md Asghar Ahmad Shahid Date: Wed, 19 Feb 2025 19:45:02 +0530 Subject: [PATCH 057/220] [MLIR][Linalg] Introduce Python API for linalg.batch_matmul Ops. (#127614) As linalg.batch_matmul has been moved into tablegen from OpDSL, its derived python wrapper no longer exists. This patch adds the required python wrapper. Also refactors the BatchMatmulOp printer to make it consistent with its parser. --- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 9 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 25 +++-- mlir/python/mlir/dialects/linalg/__init__.py | 41 ++++--- mlir/test/Dialect/Linalg/named-ops.mlir | 10 +- mlir/test/python/dialects/linalg/ops.py | 100 ++++++++++++++++++ 5 files changed, 153 insertions(+), 32 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 6a439bfb09078..a5725d6f1507e 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -858,7 +858,11 @@ def BatchMatmulOp : LinalgStructuredBase_Op<"batch_matmul", !listconcat([AttrSiz let arguments = (ins Variadic:$inputs, Variadic:$outputs, - DefaultValuedOptionalAttr:$indexing_maps + DefaultValuedOptionalAttr< + AffineMapArrayAttr, + "BatchMatmulOp::getDefaultIndexingMaps($_builder.getContext())" + >:$indexing_maps, + DefaultValuedOptionalAttr:$cast ); let results = (outs Variadic:$result_tensors); let regions = (region AnyRegion:$region); @@ -884,9 +888,10 @@ def BatchMatmulOp : LinalgStructuredBase_Op<"batch_matmul", !listconcat([AttrSiz }]>, OpBuilder< (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, - CArg<"ArrayRef", "{}">:$attributes), + "Attribute":$cast, CArg<"ArrayRef", "{}">:$attributes), [{ $_state.addOperands(operands); + $_state.addAttribute("cast", cast); $_state.addAttributes(attributes); $_state.addTypes(resultTensorTypes); (void)$_state.addRegion(), diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index b756a67f3ba7a..42ea0e1197ef1 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -3951,11 +3951,18 @@ void BatchMatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, RegionBuilderHelper helper(b, block); SmallVector yields; + TypeFn castVal = TypeFn::cast_signed; + auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { + return attr.getName() == "cast"; + }); + if (castIter != attrs.end()) { + if (auto attr = llvm::dyn_cast(castIter->getValue())) + castVal = attr.getValue(); + } + auto toType = block.getArgument(2).getType(); - Value castValA = - helper.buildTypeFn(TypeFn::cast_signed, toType, block.getArgument(0)); - Value castValB = - helper.buildTypeFn(TypeFn::cast_signed, toType, block.getArgument(1)); + Value castValA = helper.buildTypeFn(castVal, toType, block.getArgument(0)); + Value castValB = helper.buildTypeFn(castVal, toType, block.getArgument(1)); Value mulVal = helper.buildBinaryFn(BinaryFn::mul, castValA, castValB); Value addVal = helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), mulVal); @@ -4004,11 +4011,6 @@ ParseResult BatchMatmulOp::parse(OpAsmParser &parser, OperationState &result) { } void BatchMatmulOp::print(OpAsmPrinter &p) { - SmallVector elidedAttrs = { - "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; - ::printNamedStructuredOp(p, 
getOperation(), getInputs(), getOutputs(), - elidedAttrs); - SmallVector indexingMaps = llvm::map_to_vector( BatchMatmulOp::getDefaultIndexingMaps(getContext()), [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); @@ -4018,6 +4020,11 @@ void BatchMatmulOp::print(OpAsmPrinter &p) { [&](Attribute attr) { p.printAttribute(attr); }); p << "]"; } + + SmallVector elidedAttrs = { + "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); } /// Verify the user defined indexing maps. diff --git a/mlir/python/mlir/dialects/linalg/__init__.py b/mlir/python/mlir/dialects/linalg/__init__.py index 5cda4769d593f..c5fbb833ee399 100644 --- a/mlir/python/mlir/dialects/linalg/__init__.py +++ b/mlir/python/mlir/dialects/linalg/__init__.py @@ -149,7 +149,8 @@ def __init__( generic = region_op(GenericOp_, terminator=YieldOp) -def matmul( +def create_op( + op_type, *ins: Union[Operation, OpView, Value], outs: Sequence[Union[Operation, OpView, Value]], indexing_maps: Optional[Sequence[AffineMapAttr]] = None, @@ -161,7 +162,7 @@ def matmul( init = _get_op_result_or_value(outs[0]) result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] - op = MatmulOp( + op = op_type( result_tensors=result_types, inputs=ins, outputs=[init], @@ -172,24 +173,32 @@ def matmul( return op +def matmul( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Optional[Sequence[AffineMapAttr]] = None, + cast: Optional[Union[TypeFn, Attribute]] = None, +): + return create_op(MatmulOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast) + + +def batch_matmul( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Optional[Sequence[AffineMapAttr]] = None, + cast: Optional[Union[TypeFn, Attribute]] = None, +): + return create_op( + BatchMatmulOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast + ) + + def contract( *ins: Union[Operation, OpView, Value], outs: Sequence[Union[Operation, OpView, Value]], indexing_maps: Sequence[AffineMapAttr], cast: Optional[Union[TypeFn, Attribute]] = None, ): - ins = [_get_op_result_or_value(input) for input in ins] - if len(outs) > 1: - raise ValueError(f"{outs=} must have length 1.") - init = _get_op_result_or_value(outs[0]) - result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] - - op = ContractOp( - result_tensors=result_types, - inputs=ins, - outputs=[init], - indexing_maps=indexing_maps, - cast=cast, + return create_op( + ContractOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast ) - fill_builtin_region(op.operation) - return op diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 8474eeac0db5b..1bd9c8825b05e 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1497,7 +1497,7 @@ func.func @matmul_transpose_b(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %a // CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<2x5x7xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : 
memref<5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_k_to_fill_missing_dims_A(%arg0: memref<5xf32>, %arg1: memref<2x5x7xf32>, %arg2: memref<2x3x7xf32>) { @@ -1520,7 +1520,7 @@ func.func @batch_matmul_bcast_k_to_fill_missing_dims_A(%arg0: memref<5xf32>, %ar // CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<2x5x7xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_batch_dim_A(%arg0: memref<3x5xf32>, %arg1: memref<2x5x7xf32>, %arg2: memref<2x3x7xf32>) { @@ -1543,7 +1543,7 @@ func.func @batch_matmul_bcast_batch_dim_A(%arg0: memref<3x5xf32>, %arg1: memref< // CHECK-SAME: %[[VAL_0:.*]]: memref<2x3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_batch_and_n_dim_B(%arg0: memref<2x3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<2x3x7xf32>) { @@ -1566,7 +1566,7 @@ func.func @batch_matmul_bcast_batch_and_n_dim_B(%arg0: memref<2x3x5xf32>, %arg1: // CHECK-SAME: %[[VAL_0:.*]]: memref<2x3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } @@ -1622,7 +1622,7 @@ func.func @batch_matmul_explicit_transpose_B(%arg0: memref<2x3x5xf32>, %arg1: me // CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<2x7x5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) { -// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x7x5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x7x5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) // CHECK: return // CHECK: } func.func @batch_matmul_bcast_A_transpose_B(%arg0: memref<3x5xf32>, %arg1: memref<2x7x5xf32>, %arg2: memref<2x3x7xf32>) { diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 94f8ea4faf4a8..307a88709ad52 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -466,3 +466,103 @@ def matmul_as_contract_op( ) print(module) + + +# 
CHECK-LABEL: TEST: testBatchMatmulOp +@run +def testBatchMatmulOp(): + with Context(), Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + a_shape = (2, 4, 8) + b_shape = (2, 8, 12) + b_transposed_shape = (2, 12, 8) + c_shape = (2, 4, 12) + + dimBatch = ir.AffineDimExpr.get(0) + dimM = ir.AffineDimExpr.get(1) + dimN = ir.AffineDimExpr.get(2) + dimK = ir.AffineDimExpr.get(3) + + # CHECK: #[[$A_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> + # CHECK: #[[$BTrans_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> + # CHECK: #[[$C_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + + a_map = ir.AffineMap.get(4, 0, [dimBatch, dimM, dimK]) + b_transposed_map = ir.AffineMap.get(4, 0, [dimBatch, dimN, dimK]) + c_map = ir.AffineMap.get(4, 0, [dimBatch, dimM, dimN]) + + # CHECK: func.func @batch_matmul_op( + @func.FuncOp.from_py_func( + # CHECK-SAME: %[[A:.*]]: tensor<2x4x8xf32>, + RankedTensorType.get(a_shape, f32), + # CHECK-SAME: %[[Amem:.*]]: memref<2x4x8xf32>, + MemRefType.get(a_shape, f32), + # CHECK-SAME: %[[B:.*]]: tensor<2x8x12xf32>, + RankedTensorType.get(b_shape, f32), + # CHECK-SAME: %[[Bmem:.*]]: memref<2x8x12xf32>, + MemRefType.get(b_shape, f32), + # CHECK-SAME: %[[BTrans:.*]]: tensor<2x12x8xf32>, + RankedTensorType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[BTransmem:.*]]: memref<2x12x8xf32>, + MemRefType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[C:.*]]: tensor<2x4x12xf32>, + RankedTensorType.get(c_shape, f32), + # CHECK-SAME: %[[Cmem:.*]]: memref<2x4x12xf32>) + MemRefType.get(c_shape, f32), + ) + def batch_matmul_op(A, Amem, B, Bmem, Btransposed, Btransposedmem, C, Cmem): + # CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x8x12xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=(C.type,), + inputs=(A, B), + outputs=(C,), + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x8x12xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.batch_matmul(A, B, outs=(C,)) + + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<2x4x8xf32>, tensor<2x12x8xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=(C.type,), + inputs=(A, Btransposed), + outputs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<2x4x8xf32>, tensor<2x12x8xf32>) outs(%[[C]] : tensor<2x4x12xf32>) + res = linalg.batch_matmul( + A, + Btransposed, + outs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + + # CHECK: linalg.batch_matmul ins(%[[Amem]], %[[Bmem]] : memref<2x4x8xf32>, memref<2x8x12xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=[], + inputs=(Amem, Bmem), + outputs=(Cmem,), + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul ins(%[[Amem]], %[[Bmem]] : memref<2x4x8xf32>, memref<2x8x12xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + linalg.batch_matmul(Amem, Bmem, outs=(Cmem,)) + + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<2x4x8xf32>, memref<2x12x8xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + res = linalg.BatchMatmulOp( + result_tensors=[], + inputs=(Amem, 
Btransposedmem), + outputs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<2x4x8xf32>, memref<2x12x8xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>) + linalg.batch_matmul( + Amem, + Btransposedmem, + outs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + + print(module) From aa9e519b24235424df177c4e4c2806d6de9936b3 Mon Sep 17 00:00:00 2001 From: David Tenty Date: Wed, 19 Feb 2025 09:43:01 -0500 Subject: [PATCH 058/220] Revert "[PowerPC] Deprecate uses of ISD::ADDC/ISD::ADDE/ISD::SUBC/ISD::SUBE (#116984)" This reverts commit 7763119c6eb0976e4836f81c9876c49a36d46d73 (leaving the modifications from 03cb46d248b08).. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 186 +++++---------- llvm/lib/Target/PowerPC/PPCISelLowering.h | 9 +- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 20 +- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 17 -- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 44 +--- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 7 - llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 3 - llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 1 - llvm/test/CodeGen/PowerPC/adde_return_type.ll | 2 +- llvm/test/CodeGen/PowerPC/addegluecrash.ll | 24 +- llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll | 16 +- llvm/test/CodeGen/PowerPC/aix-cc-abi.ll | 8 +- .../CodeGen/PowerPC/aix-cc-byval-split.ll | 8 +- .../CodeGen/PowerPC/aix-tls-gd-longlong.ll | 48 ++-- .../PowerPC/aix-tls-le-ldst-longlong.ll | 120 +++++----- .../PowerPC/aix-tls-le-xcoff-reloc-large32.ll | 24 +- .../PowerPC/atomicrmw-cond-sub-clamp.ll | 6 +- llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll | 6 +- llvm/test/CodeGen/PowerPC/inc-of-add.ll | 2 +- llvm/test/CodeGen/PowerPC/pr35688.ll | 3 +- llvm/test/CodeGen/PowerPC/pr36292.ll | 7 +- llvm/test/CodeGen/PowerPC/pr40922.ll | 9 +- llvm/test/CodeGen/PowerPC/pr45448.ll | 12 +- llvm/test/CodeGen/PowerPC/sat-add.ll | 35 ++- llvm/test/CodeGen/PowerPC/select.ll | 20 +- llvm/test/CodeGen/PowerPC/uaddo-32.ll | 50 ++-- llvm/test/CodeGen/PowerPC/uaddo-64.ll | 82 +++---- .../umulo-128-legalisation-lowering.ll | 219 +++++++++--------- .../PowerPC/urem-seteq-illegal-types.ll | 23 +- 29 files changed, 411 insertions(+), 600 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 4720928f472b3..d6c8e8d506799 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -197,11 +197,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } setOperationAction(ISD::UADDO, RegVT, Custom); - setOperationAction(ISD::USUBO, RegVT, Custom); - - // PowerPC uses addo_carry,subo_carry to propagate carry. - setOperationAction(ISD::UADDO_CARRY, RegVT, Custom); - setOperationAction(ISD::USUBO_CARRY, RegVT, Custom); // On P10, the default lowering generates better code using the // setbc instruction. @@ -265,6 +260,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); } + // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry. 
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Legal); + setOperationAction(ISD::ADDE, VT, Legal); + setOperationAction(ISD::SUBC, VT, Legal); + setOperationAction(ISD::SUBE, VT, Legal); + } + if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -1850,14 +1854,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return "PPCISD::SETBC"; case PPCISD::SETBCR: return "PPCISD::SETBCR"; - case PPCISD::ADDC: - return "PPCISD::ADDC"; - case PPCISD::ADDE: - return "PPCISD::ADDE"; - case PPCISD::SUBC: - return "PPCISD::SUBC"; - case PPCISD::SUBE: - return "PPCISD::SUBE"; } return nullptr; } @@ -12017,74 +12013,43 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("ERROR:Should return for all cases within swtich."); } -static SDValue ConvertCarryValueToCarryFlag(EVT SumType, SDValue Value, - SelectionDAG &DAG, - const PPCSubtarget &STI) { - SDLoc DL(Value); - if (STI.useCRBits()) - Value = DAG.getNode(ISD::SELECT, DL, SumType, Value, - DAG.getConstant(1, DL, SumType), - DAG.getConstant(0, DL, SumType)); - else - Value = DAG.getZExtOrTrunc(Value, DL, SumType); - SDValue Sum = DAG.getNode(PPCISD::ADDC, DL, DAG.getVTList(SumType, MVT::i32), - Value, DAG.getAllOnesConstant(DL, SumType)); - return Sum.getValue(1); -} +SDValue PPCTargetLowering::LowerUaddo(SDValue Op, SelectionDAG &DAG) const { + // Default to target independent lowering if there is a logical user of the + // carry-bit. + for (SDNode *U : Op->users()) { + if (U->getOpcode() == ISD::SELECT) + return SDValue(); + if (ISD::isBitwiseLogicOp(U->getOpcode())) { + for (unsigned i = 0, ie = U->getNumOperands(); i != ie; ++i) { + if (U->getOperand(i).getOpcode() != ISD::UADDO && + U->getOperand(i).getOpcode() != ISD::MERGE_VALUES) + return SDValue(); + } + } + } + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); -static SDValue ConvertCarryFlagToCarryValue(EVT SumType, SDValue Flag, - EVT CarryType, SelectionDAG &DAG, - const PPCSubtarget &STI) { - SDLoc DL(Flag); - SDValue Zero = DAG.getConstant(0, DL, SumType); - SDValue Carry = DAG.getNode( - PPCISD::ADDE, DL, DAG.getVTList(SumType, MVT::i32), Zero, Zero, Flag); - if (STI.useCRBits()) - return DAG.getSetCC(DL, CarryType, Carry, Zero, ISD::SETNE); - return DAG.getZExtOrTrunc(Carry, DL, CarryType); -} + // Default to target independent lowering for special cases handled there. + if (isOneConstant(RHS) || isAllOnesConstant(RHS)) + return SDValue(); -SDValue PPCTargetLowering::LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getNode()->getValueType(0); - SDLoc DL(Op); - SDNode *N = Op.getNode(); - EVT VT = N->getValueType(0); - EVT CarryType = N->getValueType(1); - unsigned Opc = N->getOpcode(); - bool IsAdd = Opc == ISD::UADDO; - Opc = IsAdd ? 
PPCISD::ADDC : PPCISD::SUBC; - SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), - N->getOperand(0), N->getOperand(1)); - SDValue Carry = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, - DAG, Subtarget); - if (!IsAdd) - Carry = DAG.getNode(ISD::XOR, DL, CarryType, Carry, - DAG.getAllOnesConstant(DL, CarryType)); - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, Carry); -} - -SDValue PPCTargetLowering::LowerADDSUBO_CARRY(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDNode *N = Op.getNode(); - unsigned Opc = N->getOpcode(); - EVT VT = N->getValueType(0); - EVT CarryType = N->getValueType(1); - SDValue CarryOp = N->getOperand(2); - bool IsAdd = Opc == ISD::UADDO_CARRY; - Opc = IsAdd ? PPCISD::ADDE : PPCISD::SUBE; - if (!IsAdd) - CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, - DAG.getAllOnesConstant(DL, CarryOp.getValueType())); - CarryOp = ConvertCarryValueToCarryFlag(VT, CarryOp, DAG, Subtarget); - SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), - Op.getOperand(0), Op.getOperand(1), CarryOp); - CarryOp = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, DAG, - Subtarget); - if (!IsAdd) - CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, - DAG.getAllOnesConstant(DL, CarryOp.getValueType())); - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, CarryOp); + SDValue ADDC; + SDValue Overflow; + SDVTList VTs = Op.getNode()->getVTList(); + + ADDC = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), LHS, RHS); + Overflow = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(VT, MVT::Glue), + DAG.getConstant(0, dl, VT), DAG.getConstant(0, dl, VT), + ADDC.getValue(1)); + SDValue OverflowTrunc = + DAG.getNode(ISD::TRUNCATE, dl, Op.getNode()->getValueType(1), Overflow); + SDValue Res = + DAG.getNode(ISD::MERGE_VALUES, dl, VTs, ADDC.getValue(0), OverflowTrunc); + return Res; } SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { @@ -12115,8 +12080,8 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - default: - llvm_unreachable("Wasn't expecting to be able to lower this!"); + default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::UADDO: return LowerUaddo(Op, DAG); case ISD::FPOW: return lowerPow(Op, DAG); case ISD::FSIN: return lowerSin(Op, DAG); case ISD::FCOS: return lowerCos(Op, DAG); @@ -12209,12 +12174,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerATOMIC_LOAD_STORE(Op, DAG); case ISD::IS_FPCLASS: return LowerIS_FPCLASS(Op, DAG); - case ISD::UADDO: - case ISD::USUBO: - return LowerADDSUBO(Op, DAG); - case ISD::UADDO_CARRY: - case ISD::USUBO_CARRY: - return LowerADDSUBO_CARRY(Op, DAG); } } @@ -16150,21 +16109,6 @@ static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) { return true; } -static SDValue DAGCombineAddc(SDNode *N, - llvm::PPCTargetLowering::DAGCombinerInfo &DCI) { - if (N->getOpcode() == PPCISD::ADDC && N->hasAnyUseOfValue(1)) { - // (ADDC (ADDE 0, 0, C), -1) -> C - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS->getOpcode() == PPCISD::ADDE && - isNullConstant(LHS->getOperand(0)) && - isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) { - return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); - } - } - return SDValue(); -} - SDValue 
PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -16953,8 +16897,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); - case PPCISD::ADDC: - return DAGCombineAddc(N, DCI); } return SDValue(); @@ -17008,16 +16950,6 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero = 0xFFFF0000; break; } - case PPCISD::ADDE: { - if (Op.getResNo() == 0) { - // (0|1), _ = ADDE 0, 0, CARRY - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - if (isNullConstant(LHS) && isNullConstant(RHS)) - Known.Zero = ~1ULL; - } - break; - } case ISD::INTRINSIC_WO_CHAIN: { switch (Op.getConstantOperandVal(0)) { default: break; @@ -18287,8 +18219,7 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, return SDValue(); SDLoc DL(N); - EVT CarryType = Subtarget.useCRBits() ? MVT::i1 : MVT::i32; - SDVTList VTs = DAG.getVTList(MVT::i64, CarryType); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue); SDValue Cmp = RHS.getOperand(0); SDValue Z = Cmp.getOperand(0); auto *Constant = cast(Cmp.getOperand(1)); @@ -18306,14 +18237,11 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? Add : Z; - SDValue Addc = - DAG.getNode(ISD::UADDO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64), - DAG.getConstant(0, DL, CarryType)); - return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, - DAG.getConstant(0, DL, MVT::i64), + SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64)); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), SDValue(Addc.getNode(), 1)); - } + } case ISD::SETEQ: { // when C == 0 // --> addze X, (subfic Z, 0).carry @@ -18324,15 +18252,11 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? Add : Z; - SDValue Subc = - DAG.getNode(ISD::USUBO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - DAG.getConstant(0, DL, MVT::i64), AddOrZ, - DAG.getConstant(0, DL, CarryType)); - SDValue Invert = DAG.getNode(ISD::XOR, DL, CarryType, Subc.getValue(1), - DAG.getAllOnesConstant(DL, CarryType)); - return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, - DAG.getConstant(0, DL, MVT::i64), Invert); - } + SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + DAG.getConstant(0, DL, MVT::i64), AddOrZ); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), + SDValue(Subc.getNode(), 1)); + } } return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 2d86a224b54c1..514329bbe92d7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -161,12 +161,6 @@ namespace llvm { SRA, SHL, - /// These nodes represent PPC arithmetic operations with carry. - ADDC, - ADDE, - SUBC, - SUBE, - /// FNMSUB - Negated multiply-subtract instruction. 
FNMSUB, @@ -1286,6 +1280,7 @@ namespace llvm { SDValue LowerGlobalTLSAddressLinux(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUaddo(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; @@ -1321,8 +1316,6 @@ namespace llvm { SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerToLibCall(const char *LibCallName, SDValue Op, SelectionDAG &DAG) const; SDValue lowerLibCallBasedOnType(const char *LibCallFloatName, diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 4205b3086a3c9..bcac0de55d9d3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -760,13 +760,13 @@ def STFDXTLS : XForm_8<31, 727, (outs), (ins f8rc:$RST, ptr_rc_nor0:$RA, tlsreg: let isCommutable = 1 in defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCaddc i64:$RA, i64:$RB))]>, + [(set i64:$RT, (addc i64:$RA, i64:$RB))]>, PPC970_DGroup_Cracked; let Defs = [CARRY] in def ADDIC8 : DForm_2<12, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (PPCaddc i64:$RA, imm64SExt16:$D))]>; + [(set i64:$RST, (addc i64:$RA, imm64SExt16:$D))]>; def ADDI8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), "addi $RST, $RA, $D", IIC_IntSimple, [(set i64:$RST, (add i64:$RA, imm64SExt16:$D))]>; @@ -782,11 +782,11 @@ def LA8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), let Defs = [CARRY] in { def SUBFIC8: DForm_2< 8, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (PPCsubc imm64SExt16:$D, i64:$RA))]>; + [(set i64:$RST, (subc imm64SExt16:$D, i64:$RA))]>; } defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCsubc i64:$RB, i64:$RA))]>, + [(set i64:$RT, (subc i64:$RB, i64:$RA))]>, PPC970_DGroup_Cracked; defm SUBF8 : XOForm_1rx<31, 40, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subf", "$RT, $RA, $RB", IIC_IntGeneral, @@ -798,22 +798,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, i64:$RB, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, i64:$RB))]>; defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, -1, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, -1))]>; defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, 0, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, 0))]>; defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCsube i64:$RB, i64:$RA, CARRY))]>; + [(set 
i64:$RT, (sube i64:$RB, i64:$RA))]>; defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCsube -1, i64:$RA, CARRY))]>; + [(set i64:$RT, (sube -1, i64:$RA))]>; defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCsube 0, i64:$RA, CARRY))]>; + [(set i64:$RT, (sube 0, i64:$RA))]>; } } // isCodeGenOnly diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 6e0640fa715ea..3aef6f2c893fa 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1758,23 +1758,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; - } else if ((PPC::G8RCRegClass.contains(DestReg) || - PPC::GPRCRegClass.contains(DestReg)) && - SrcReg == PPC::CARRY) { - bool Is64Bit = PPC::G8RCRegClass.contains(DestReg); - BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MFSPR8 : PPC::MFSPR), DestReg) - .addImm(1) - .addReg(PPC::CARRY, RegState::Implicit); - return; - } else if ((PPC::G8RCRegClass.contains(SrcReg) || - PPC::GPRCRegClass.contains(SrcReg)) && - DestReg == PPC::CARRY) { - bool Is64Bit = PPC::G8RCRegClass.contains(SrcReg); - BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MTSPR8 : PPC::MTSPR)) - .addImm(1) - .addReg(SrcReg) - .addReg(PPC::CARRY, RegState::ImplicitDefine); - return; } unsigned Opc; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index b5ed5d55da4c7..be90a5c562c57 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -124,21 +124,6 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> ]>; -// RES, CARRY = op LHS, RHS -def SDT_PPCBinaryArithWithFlagsOut : SDTypeProfile<2, 2, [ - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisVT<1, i32>, -]>; - -// RES, CARRY = op LHS, RHS, CARRY -def SDT_PPCBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [ - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisSameAs<1, 4>, - SDTCisVT<1, i32>, -]>; - //===----------------------------------------------------------------------===// // PowerPC specific DAG Nodes. // @@ -416,15 +401,6 @@ def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", SDTIntUnaryOp, []>; -def PPCaddc : SDNode<"PPCISD::ADDC", SDT_PPCBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def PPCadde : SDNode<"PPCISD::ADDE", SDT_PPCBinaryArithWithFlagsInOut, - []>; -def PPCsubc : SDNode<"PPCISD::SUBC", SDT_PPCBinaryArithWithFlagsOut, - []>; -def PPCsube : SDNode<"PPCISD::SUBE", SDT_PPCBinaryArithWithFlagsInOut, - []>; - //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. 
// @@ -2315,7 +2291,7 @@ let BaseName = "addic" in { let Defs = [CARRY] in def ADDIC : DForm_2<12, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (PPCaddc i32:$RA, imm32SExt16:$D))]>, + [(set i32:$RST, (addc i32:$RA, imm32SExt16:$D))]>, RecFormRel, PPC970_DGroup_Cracked; let Defs = [CARRY, CR0] in def ADDIC_rec : DForm_2<13, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), @@ -2336,7 +2312,7 @@ def MULLI : DForm_2< 7, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), let Defs = [CARRY] in def SUBFIC : DForm_2< 8, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (PPCsubc imm32SExt16:$D, i32:$RA))]>; + [(set i32:$RST, (subc imm32SExt16:$D, i32:$RA))]>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def LI : DForm_2_r0<14, (outs gprc:$RST), (ins s16imm:$D), @@ -2933,7 +2909,7 @@ def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$RT), (ins gprc:$RA, tlsreg32:$RB let isCommutable = 1 in defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCaddc i32:$RA, i32:$RB))]>, + [(set i32:$RT, (addc i32:$RA, i32:$RB))]>, PPC970_DGroup_Cracked; defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), @@ -2966,7 +2942,7 @@ defm SUBF : XOForm_1rx<31, 40, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), [(set i32:$RT, (sub i32:$RB, i32:$RA))]>; defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCsubc i32:$RB, i32:$RA))]>, + [(set i32:$RT, (subc i32:$RB, i32:$RA))]>, PPC970_DGroup_Cracked; defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$RT), (ins gprc:$RA), "neg", "$RT, $RA", IIC_IntSimple, @@ -2975,22 +2951,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, i32:$RB, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, i32:$RB))]>; defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$RT), (ins gprc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, -1, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, -1))]>; defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$RT), (ins gprc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, 0, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, 0))]>; defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCsube i32:$RB, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube i32:$RB, i32:$RA))]>; defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$RT), (ins gprc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCsube -1, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube -1, i32:$RA))]>; defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$RT), (ins gprc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCsube 0, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube 0, i32:$RA))]>; } } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2177dba1e5762..b60a91be82406 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -625,13 +625,6 @@ bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg, return BaseImplRetVal; } -const TargetRegisterClass * -PPCRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) 
const { - if (RC == &PPC::CARRYRCRegClass) - return TM.isPPC64() ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - return RC; -} - unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 21b6f7b13939a..274c7cb68ae0a 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -76,9 +76,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; - const TargetRegisterClass * - getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 8b690b7b833b3..3cb7cd9d8f229 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -494,7 +494,6 @@ def LR8RC : RegisterClass<"PPC", [i64], 64, (add LR8)> { def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> { let CopyCost = -1; - let isAllocatable = 0; } // Make AllocationOrder as similar as G8RC's to avoid potential spilling. diff --git a/llvm/test/CodeGen/PowerPC/adde_return_type.ll b/llvm/test/CodeGen/PowerPC/adde_return_type.ll index 47c5efc35afc6..7ce11079a6267 100644 --- a/llvm/test/CodeGen/PowerPC/adde_return_type.ll +++ b/llvm/test/CodeGen/PowerPC/adde_return_type.ll @@ -3,7 +3,7 @@ ; RUN: < %s -o /dev/null 2>&1 | FileCheck %s define i64 @testAddeReturnType(i64 %X, i64 %Z) { -; CHECK: Legally typed node: {{.*}}: i64,i1 = uaddo {{.*}} +; CHECK: Legally typed node: {{.*}}: i64,glue = adde {{.*}} %cmp = icmp ne i64 %Z, 0 %conv1 = zext i1 %cmp to i64 %add = add nsw i64 %conv1, %X diff --git a/llvm/test/CodeGen/PowerPC/addegluecrash.ll b/llvm/test/CodeGen/PowerPC/addegluecrash.ll index 7cd94c0e4c2d5..a711b09b9bdfd 100644 --- a/llvm/test/CodeGen/PowerPC/addegluecrash.ll +++ b/llvm/test/CodeGen/PowerPC/addegluecrash.ll @@ -9,20 +9,20 @@ define void @bn_mul_comba8(ptr nocapture %r, ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: std 4, -8(1) # 8-byte Folded Spill ; CHECK-NEXT: mr 4, 3 ; CHECK-NEXT: ld 3, -8(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 6, 0(3) -; CHECK-NEXT: ld 11, 0(5) -; CHECK-NEXT: mulhdu 8, 11, 6 +; CHECK-NEXT: ld 9, 0(3) +; CHECK-NEXT: ld 8, 0(5) +; CHECK-NEXT: mulhdu 7, 8, 9 ; CHECK-NEXT: ld 3, 8(3) -; CHECK-NEXT: mulld 7, 3, 6 -; CHECK-NEXT: addc 9, 7, 8 -; CHECK-NEXT: ld 10, 8(5) -; CHECK-NEXT: mulhdu 5, 10, 11 -; CHECK-NEXT: mulld 10, 10, 11 -; CHECK-NEXT: addc 9, 9, 10 +; CHECK-NEXT: mulld 6, 3, 9 +; CHECK-NEXT: mulhdu 3, 3, 9 +; CHECK-NEXT: addc 6, 6, 7 +; CHECK-NEXT: addze 3, 3 +; CHECK-NEXT: ld 5, 8(5) +; CHECK-NEXT: mulld 7, 5, 8 +; CHECK-NEXT: mulhdu 5, 5, 8 +; CHECK-NEXT: addc 6, 6, 7 ; CHECK-NEXT: addze 5, 5 -; CHECK-NEXT: addc 7, 7, 8 -; CHECK-NEXT: mulhdu 3, 3, 6 -; CHECK-NEXT: adde 3, 5, 3 +; CHECK-NEXT: add 3, 5, 3 ; CHECK-NEXT: cmpld 3, 5 ; CHECK-NEXT: crmove 20, 0 ; CHECK-NEXT: li 5, 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll index aead5762d0921..501227c9072c4 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll @@ -1103,13 
+1103,13 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0) ; 32BIT-NEXT: renamable $r12 = LWZ 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4) ; 32BIT-NEXT: renamable $r0 = LBZ 3, %fixed-stack.1 :: (load (s8) from %fixed-stack.1 + 3, basealign 4) - ; 32BIT-NEXT: renamable $r31 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) - ; 32BIT-NEXT: renamable $r30 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) + ; 32BIT-NEXT: renamable $r31 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) + ; 32BIT-NEXT: renamable $r30 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) ; 32BIT-NEXT: renamable $r29 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5, align 8) ; 32BIT-NEXT: renamable $r28 = LBZ 3, %fixed-stack.6 :: (load (s8) from %fixed-stack.6 + 3, basealign 4) ; 32BIT-NEXT: renamable $r27 = LHA 2, %fixed-stack.7 :: (load (s16) from %fixed-stack.7 + 2, basealign 4) - ; 32BIT-NEXT: renamable $r26 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) - ; 32BIT-NEXT: renamable $r25 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r26 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r25 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6 @@ -1120,8 +1120,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10 ; 32BIT-NEXT: renamable $r6 = SRAWI renamable $r3, 31, implicit-def dead $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r25, implicit-def $carry - ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r26, implicit-def dead $carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r26, implicit-def $carry + ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r25, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r7 = SRAWI renamable $r27, 31, implicit-def dead $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r27, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r7, implicit-def dead $carry, implicit $carry @@ -1131,8 +1131,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r6 = ADDZE killed renamable $r6, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r12, implicit-def $carry ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r6, killed renamable $r4, implicit-def dead $carry, implicit $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r30, implicit-def $carry - ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r31, implicit-def dead $carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC 
killed renamable $r3, killed renamable $r31, implicit-def $carry + ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r30, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r0, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDZE killed renamable $r4, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r4 = ADDC killed renamable $r3, killed renamable $r11, implicit-def $carry diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 8f33f5ef863e6..79c59e925302a 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1213,14 +1213,14 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; ASM32PWR4-NEXT: addc 3, 3, 6 ; ASM32PWR4-NEXT: addze 6, 7 ; ASM32PWR4-NEXT: addc 3, 3, 9 -; ASM32PWR4-NEXT: lwz 7, 84(1) +; ASM32PWR4-NEXT: lwz 5, 84(1) ; ASM32PWR4-NEXT: addze 6, 6 ; ASM32PWR4-NEXT: addc 3, 3, 31 -; ASM32PWR4-NEXT: lwz 5, 80(1) +; ASM32PWR4-NEXT: lwz 7, 80(1) ; ASM32PWR4-NEXT: adde 6, 6, 30 -; ASM32PWR4-NEXT: addc 3, 3, 7 +; ASM32PWR4-NEXT: addc 3, 3, 5 ; ASM32PWR4-NEXT: lbz 8, 91(1) -; ASM32PWR4-NEXT: adde 5, 6, 5 +; ASM32PWR4-NEXT: adde 5, 6, 7 ; ASM32PWR4-NEXT: addc 3, 3, 8 ; ASM32PWR4-NEXT: lbz 6, 103(1) ; ASM32PWR4-NEXT: addze 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll index 9b1893b111556..f1bf7c262317d 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll @@ -36,17 +36,17 @@ entry: ; CHECK32: bb.0.entry: ; CHECK32-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 -; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 80, %fixed-stack.0 +; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 84, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4 -; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 84, %fixed-stack.0 +; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 80, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8 ; CHECK32-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12 ; CHECK32-DAG: STW renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16 ; CHECK32-DAG: STW renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20 ; CHECK32-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24 -; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG2]], implicit-def $carry -; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG1]], implicit-def dead $carry, implicit killed $carry +; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG1]], implicit-def $carry +; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG2]], implicit-def dead $carry, implicit killed $carry ; CHECK32 STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28 ; CHECK32: BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll index 5f471ce83828a..53a7cb0aad9ee 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll 
+++ b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll @@ -325,12 +325,12 @@ define i64 @loadsTGInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 0(3) -; SMALL32-NEXT: lwz 3, 4(3) +; SMALL32-NEXT: lwz 5, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) +; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 4, 6, 3 -; SMALL32-NEXT: adde 3, 7, 5 +; SMALL32-NEXT: addc 4, 6, 5 +; SMALL32-NEXT: adde 3, 7, 3 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -346,14 +346,14 @@ define i64 @loadsTGInit() #1 { ; LARGE32-NEXT: lwz 3, L..C0@l(3) ; LARGE32-NEXT: lwz 4, L..C1@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 5, 0(3) -; LARGE32-NEXT: lwz 3, 4(3) -; LARGE32-NEXT: addis 4, L..C7@u(2) -; LARGE32-NEXT: lwz 4, L..C7@l(4) -; LARGE32-NEXT: lwz 6, 4(4) -; LARGE32-NEXT: lwz 7, 0(4) -; LARGE32-NEXT: addc 4, 6, 3 -; LARGE32-NEXT: adde 3, 7, 5 +; LARGE32-NEXT: lwz 4, 4(3) +; LARGE32-NEXT: lwz 3, 0(3) +; LARGE32-NEXT: addis 5, L..C7@u(2) +; LARGE32-NEXT: lwz 5, L..C7@l(5) +; LARGE32-NEXT: lwz 6, 4(5) +; LARGE32-NEXT: lwz 5, 0(5) +; LARGE32-NEXT: addc 4, 6, 4 +; LARGE32-NEXT: adde 3, 5, 3 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 @@ -589,12 +589,12 @@ define i64 @loadsTWInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 0(3) -; SMALL32-NEXT: lwz 3, 4(3) +; SMALL32-NEXT: lwz 5, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) +; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 4, 6, 3 -; SMALL32-NEXT: adde 3, 7, 5 +; SMALL32-NEXT: addc 4, 6, 5 +; SMALL32-NEXT: adde 3, 7, 3 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -610,14 +610,14 @@ define i64 @loadsTWInit() #1 { ; LARGE32-NEXT: lwz 3, L..C5@l(3) ; LARGE32-NEXT: lwz 4, L..C6@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 5, 0(3) -; LARGE32-NEXT: lwz 3, 4(3) -; LARGE32-NEXT: addis 4, L..C7@u(2) -; LARGE32-NEXT: lwz 4, L..C7@l(4) -; LARGE32-NEXT: lwz 6, 4(4) -; LARGE32-NEXT: lwz 7, 0(4) -; LARGE32-NEXT: addc 4, 6, 3 -; LARGE32-NEXT: adde 3, 7, 5 +; LARGE32-NEXT: lwz 4, 4(3) +; LARGE32-NEXT: lwz 3, 0(3) +; LARGE32-NEXT: addis 5, L..C7@u(2) +; LARGE32-NEXT: lwz 5, L..C7@l(5) +; LARGE32-NEXT: lwz 6, 4(5) +; LARGE32-NEXT: lwz 5, 0(5) +; LARGE32-NEXT: addc 4, 6, 4 +; LARGE32-NEXT: adde 3, 5, 3 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll index 533c866eb4e12..c2d7325107a84 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll @@ -304,15 +304,15 @@ define i64 @loadITLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C0(r2) # target-flags(ppc-tprel) @IThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz 
r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -327,14 +327,14 @@ define i64 @loadITLUninit2() { ; LARGE32-NEXT: lwz r4, L..C0@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -424,15 +424,15 @@ define i64 @loadITLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C1(r2) # target-flags(ppc-tprel) @IThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -447,14 +447,14 @@ define i64 @loadITLInit2() { ; LARGE32-NEXT: lwz r4, L..C1@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -544,15 +544,15 @@ define i64 @loadTLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C2(r2) # target-flags(ppc-tprel) @ThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -567,14 +567,14 @@ define i64 @loadTLUninit2() { ; LARGE32-NEXT: lwz r4, L..C2@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 
0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -664,15 +664,15 @@ define i64 @loadTLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C3(r2) # target-flags(ppc-tprel) @ThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -687,14 +687,14 @@ define i64 @loadTLInit2() { ; LARGE32-NEXT: lwz r4, L..C3@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll index 268402170063e..6c0ea782c2a38 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll @@ -290,16 +290,16 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} bla 0 ; DIS-NEXT: {{0*}}[[#ADDR]]: R_RBA (idx: [[#NFA+1]]) .__get_tpointer[PR] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 4, 2, 0 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 5, 2, 0 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 8(4) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 8(5) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 0(4) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(4) -; DIS-NEXT: addc 4, 4, 3 -; DIS-NEXT: adde 3, 6, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 4(5) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(5) +; DIS-NEXT: addc 4, 6, 4 +; DIS-NEXT: adde 3, 5, 3 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: 
lwz 0, 8(1) ; DIS-NEXT: mtlr 0 @@ -324,10 +324,10 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 12(4) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+27]]) IThreadLocalVarUninit2[TE] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) -; DIS-NEXT: addic 4, 3, 1 -; DIS-NEXT: addze 3, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) +; DIS-NEXT: addic 4, 4, 1 +; DIS-NEXT: addze 3, 3 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: lwz 0, 8(1) ; DIS-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll index 4f00cff83942a..0ff2f28207ed4 100644 --- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll +++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll @@ -357,10 +357,10 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: .LBB7_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB7_4 Depth 2 -; CHECK-NEXT: subc 5, 6, 4 +; CHECK-NEXT: sub 5, 6, 4 +; CHECK-NEXT: cmpld 5, 6 ; CHECK-NEXT: li 7, 0 -; CHECK-NEXT: addze. 8, 7 -; CHECK-NEXT: beq 0, .LBB7_4 +; CHECK-NEXT: bgt 0, .LBB7_4 ; CHECK-NEXT: # %bb.3: # %atomicrmw.start ; CHECK-NEXT: # ; CHECK-NEXT: mr 7, 5 diff --git a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll index 29e7a16739864..34091ba46c3f6 100644 --- a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll +++ b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll @@ -12,11 +12,11 @@ define double @postinctodbl(ptr nocapture %llp) #0 { ; CHECK-NEXT: addic 4, 4, 1 ; CHECK-NEXT: lwz 5, 0(3) ; CHECK-NEXT: stw 5, 8(1) +; CHECK-NEXT: addze 5, 5 ; CHECK-NEXT: lfd 0, 8(1) -; CHECK-NEXT: stw 4, 4(3) -; CHECK-NEXT: addze 4, 5 +; CHECK-NEXT: stw 5, 0(3) ; CHECK-NEXT: fcfid 1, 0 -; CHECK-NEXT: stw 4, 0(3) +; CHECK-NEXT: stw 4, 4(3) ; CHECK-NEXT: addi 1, 1, 16 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/inc-of-add.ll b/llvm/test/CodeGen/PowerPC/inc-of-add.ll index 432b5a6b362fe..98b812e7845a5 100644 --- a/llvm/test/CodeGen/PowerPC/inc-of-add.ll +++ b/llvm/test/CodeGen/PowerPC/inc-of-add.ll @@ -412,8 +412,8 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; PPC32-NEXT: not 4, 4 ; PPC32-NEXT: not 3, 3 ; PPC32-NEXT: subc 4, 8, 4 -; PPC32-NEXT: subfe 3, 3, 7 ; PPC32-NEXT: not 6, 6 +; PPC32-NEXT: subfe 3, 3, 7 ; PPC32-NEXT: not 5, 5 ; PPC32-NEXT: subc 6, 10, 6 ; PPC32-NEXT: subfe 5, 5, 9 diff --git a/llvm/test/CodeGen/PowerPC/pr35688.ll b/llvm/test/CodeGen/PowerPC/pr35688.ll index 5746934802eb2..8a4351b229fd1 100644 --- a/llvm/test/CodeGen/PowerPC/pr35688.ll +++ b/llvm/test/CodeGen/PowerPC/pr35688.ll @@ -8,9 +8,10 @@ define void @ec_GFp_nistp256_points_mul() { ; CHECK-LABEL: ec_GFp_nistp256_points_mul: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: ld 3, 0(3) -; CHECK-NEXT: subfic 4, 3, 0 ; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: subfic 5, 3, 0 ; CHECK-NEXT: subfze 5, 4 +; CHECK-NEXT: sradi 5, 5, 63 ; CHECK-NEXT: subc 3, 5, 3 ; CHECK-NEXT: subfe 3, 4, 5 ; CHECK-NEXT: sradi 3, 3, 63 diff --git a/llvm/test/CodeGen/PowerPC/pr36292.ll b/llvm/test/CodeGen/PowerPC/pr36292.ll index 98d94646bce65..1794b3ba526ed 100644 --- a/llvm/test/CodeGen/PowerPC/pr36292.ll +++ b/llvm/test/CodeGen/PowerPC/pr36292.ll @@ -12,12 +12,11 @@ define void @test() nounwind comdat { ; CHECK-NEXT: std 30, -16(1) # 8-byte Folded Spill ; CHECK-NEXT: stdu 1, -64(1) ; CHECK-NEXT: std 0, 80(1) 
-; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: ld 3, 0(3) ; CHECK-NEXT: ld 30, 32(1) -; CHECK-NEXT: subc 3, 3, 30 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: iseleq 3, 0, 3 +; CHECK-NEXT: sub 4, 3, 30 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: iselgt 3, 0, 4 ; CHECK-NEXT: addi 29, 3, 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %forcond diff --git a/llvm/test/CodeGen/PowerPC/pr40922.ll b/llvm/test/CodeGen/PowerPC/pr40922.ll index ed840ad12b7ed..9252e9a3e3aa4 100644 --- a/llvm/test/CodeGen/PowerPC/pr40922.ll +++ b/llvm/test/CodeGen/PowerPC/pr40922.ll @@ -23,10 +23,11 @@ define i32 @a() { ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: addic 6, 4, 6 -; CHECK-NEXT: addze. 5, 5 -; CHECK-NEXT: rlwinm 5, 6, 0, 28, 26 -; CHECK-NEXT: cmplw 1, 5, 4 -; CHECK-NEXT: crnand 20, 4, 2 +; CHECK-NEXT: addze 5, 5 +; CHECK-NEXT: rlwinm 6, 6, 0, 28, 26 +; CHECK-NEXT: andi. 5, 5, 1 +; CHECK-NEXT: cmplw 1, 6, 4 +; CHECK-NEXT: crorc 20, 1, 4 ; CHECK-NEXT: bc 12, 20, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: bl e diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll index 0edbae47e9378..0f2dcb3ccc8a0 100644 --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -22,14 +22,12 @@ define hidden void @julia_tryparse_internal_45896() #0 { ; CHECK-NEXT: li r5, -3 ; CHECK-NEXT: sradi r4, r3, 63 ; CHECK-NEXT: rldic r5, r5, 4, 32 -; CHECK-NEXT: mulld r6, r4, r5 ; CHECK-NEXT: mulhdu r3, r3, r5 -; CHECK-NEXT: mulhdu r4, r4, r5 -; CHECK-NEXT: addc r3, r3, r6 -; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: addze r3, r3 -; CHECK-NEXT: or. r3, r4, r3 -; CHECK-NEXT: beq cr0, .LBB0_9 +; CHECK-NEXT: maddld r6, r4, r5, r3 +; CHECK-NEXT: cmpld cr1, r6, r3 +; CHECK-NEXT: mulhdu. r3, r4, r5 +; CHECK-NEXT: crorc 4*cr5+lt, 4*cr1+lt, eq +; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_9 ; CHECK-NEXT: # %bb.8: # %L917 ; CHECK-NEXT: .LBB0_9: # %L994 top: diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll index d9b22bda85e44..8fff2c28da245 100644 --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -156,11 +156,10 @@ define i64 @unsigned_sat_constant_i64_using_min(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addic 3, 3, 42 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: addi 4, 3, 42 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, %a @@ -171,11 +170,10 @@ define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addic 3, 3, 42 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: addi 4, 3, 42 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, -43 @@ -348,11 +346,10 @@ define i64 @unsigned_sat_variable_i64_using_min(i64 %x, i64 %y) { define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) { ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: addc 3, 3, 4 -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addze. 
4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: add 4, 3, 4 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, %y %c = icmp ugt i64 %x, %a @@ -862,11 +859,9 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { define i64 @unsigned_sat_constant_i64_with_single_use(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_with_single_use: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 4 -; CHECK-NEXT: subc 3, 3, 4 -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: iseleq 3, 0, 3 +; CHECK-NEXT: addi 4, 3, -4 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: iselgt 3, 0, 4 ; CHECK-NEXT: blr %umin = call i64 @llvm.umin.i64(i64 %x, i64 4) %sub = sub i64 %x, %umin diff --git a/llvm/test/CodeGen/PowerPC/select.ll b/llvm/test/CodeGen/PowerPC/select.ll index 10661030da8d8..289f83c475ff3 100644 --- a/llvm/test/CodeGen/PowerPC/select.ll +++ b/llvm/test/CodeGen/PowerPC/select.ll @@ -135,22 +135,18 @@ define i64 @f4_sge_0(i64 %x) { ; ; CHECK-32-LABEL: f4_sge_0: ; CHECK-32: # %bb.0: -; CHECK-32-NEXT: mr r6, r4 +; CHECK-32-NEXT: mr r5, r4 ; CHECK-32-NEXT: subfic r4, r4, 0 +; CHECK-32-NEXT: mr r6, r3 ; CHECK-32-NEXT: cmpwi r3, -1 -; CHECK-32-NEXT: subfze r5, r3 -; CHECK-32-NEXT: ble cr0, .LBB5_3 +; CHECK-32-NEXT: subfze r3, r3 +; CHECK-32-NEXT: bgt cr0, .LBB5_2 ; CHECK-32-NEXT: # %bb.1: -; CHECK-32-NEXT: ble cr0, .LBB5_4 +; CHECK-32-NEXT: mr r3, r6 ; CHECK-32-NEXT: .LBB5_2: -; CHECK-32-NEXT: mr r3, r5 -; CHECK-32-NEXT: blr -; CHECK-32-NEXT: .LBB5_3: -; CHECK-32-NEXT: mr r4, r6 -; CHECK-32-NEXT: bgt cr0, .LBB5_2 -; CHECK-32-NEXT: .LBB5_4: -; CHECK-32-NEXT: mr r5, r3 -; CHECK-32-NEXT: mr r3, r5 +; CHECK-32-NEXT: bgtlr cr0 +; CHECK-32-NEXT: # %bb.3: +; CHECK-32-NEXT: mr r4, r5 ; CHECK-32-NEXT: blr %c = icmp sge i64 %x, 0 %x.neg = sub i64 0, %x diff --git a/llvm/test/CodeGen/PowerPC/uaddo-32.ll b/llvm/test/CodeGen/PowerPC/uaddo-32.ll index 5dd5a2672b166..b5989fc2ee2da 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-32.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-32.ll @@ -1,24 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM -; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s define noundef i32 @add(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 6 -; LINUXASM-NEXT: stw 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: stw 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %0, 1 @@ -31,22 +22,13 @@ entry: declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) define noundef zeroext i1 @add_overflow(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add_overflow: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; 
LINUXASM-NEXT: addc 4, 3, 4 -; LINUXASM-NEXT: addze 3, 6 -; LINUXASM-NEXT: stw 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add_overflow: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 4, 3, 4 -; AIXASM-NEXT: li 3, 0 -; AIXASM-NEXT: addze 3, 3 -; AIXASM-NEXT: stw 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/uaddo-64.ll b/llvm/test/CodeGen/PowerPC/uaddo-64.ll index 98e834f29467c..3c7ab2c2bab79 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-64.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-64.ll @@ -1,24 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM -; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM +; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s define noundef i64 @add(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 6 -; LINUXASM-NEXT: std 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: std 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -31,22 +22,13 @@ entry: declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) define noundef zeroext i1 @add_overflow(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add_overflow: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 4, 3, 4 -; LINUXASM-NEXT: addze 3, 6 -; LINUXASM-NEXT: std 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add_overflow: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 4, 3, 4 -; AIXASM-NEXT: li 3, 0 -; AIXASM-NEXT: addze 3, 3 -; AIXASM-NEXT: std 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -56,28 +38,16 @@ entry: } define noundef i64 @addWithCarryIn (i64 noundef %a, i64 noundef %b, i64 noundef %c, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: addWithCarryIn: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 7, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 7 -; LINUXASM-NEXT: addc 3, 3, 5 -; LINUXASM-NEXT: addze 5, 7 -; LINUXASM-NEXT: or 4, 4, 5 -; LINUXASM-NEXT: std 4, 0(6) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .addWithCarryIn: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 7, 4 -; AIXASM-NEXT: addc 3, 3, 5 -; 
AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: or 4, 7, 4 -; AIXASM-NEXT: std 4, 0(6) -; AIXASM-NEXT: blr - +; CHECK-LABEL: addWithCarryIn: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 7, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 7 +; CHECK-NEXT: addc 3, 3, 5 +; CHECK-NEXT: addze 5, 7 +; CHECK-NEXT: or 4, 4, 5 +; CHECK-NEXT: std 4, 0(6) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index f573fdab1b153..84895e74f18d5 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -5,134 +5,137 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC64-LABEL: muloti_test: ; PPC64: # %bb.0: # %start -; PPC64-NEXT: addic 9, 5, -1 -; PPC64-NEXT: mulld 10, 5, 4 -; PPC64-NEXT: mulld 11, 3, 6 -; PPC64-NEXT: subfe 9, 9, 5 -; PPC64-NEXT: add 10, 11, 10 -; PPC64-NEXT: addic 11, 3, -1 -; PPC64-NEXT: mulhdu 8, 3, 6 -; PPC64-NEXT: subfe 3, 11, 3 -; PPC64-NEXT: and 3, 3, 9 -; PPC64-NEXT: addic 9, 8, -1 -; PPC64-NEXT: subfe 8, 9, 8 -; PPC64-NEXT: or 3, 3, 8 -; PPC64-NEXT: mulhdu 5, 5, 4 ; PPC64-NEXT: addic 8, 5, -1 +; PPC64-NEXT: mulhdu 9, 5, 4 +; PPC64-NEXT: mulld 10, 5, 4 ; PPC64-NEXT: subfe 5, 8, 5 -; PPC64-NEXT: li 7, 0 -; PPC64-NEXT: or 5, 3, 5 -; PPC64-NEXT: mulhdu 8, 4, 6 -; PPC64-NEXT: addc 3, 8, 10 -; PPC64-NEXT: addze 7, 7 -; PPC64-NEXT: addic 8, 7, -1 -; PPC64-NEXT: subfe 7, 8, 7 +; PPC64-NEXT: mulld 8, 3, 6 +; PPC64-NEXT: add 8, 8, 10 +; PPC64-NEXT: addic 10, 3, -1 +; PPC64-NEXT: mulhdu 7, 3, 6 +; PPC64-NEXT: subfe 3, 10, 3 +; PPC64-NEXT: and 5, 3, 5 +; PPC64-NEXT: addic 3, 7, -1 +; PPC64-NEXT: subfe 7, 3, 7 +; PPC64-NEXT: or 5, 5, 7 +; PPC64-NEXT: mulhdu 10, 4, 6 +; PPC64-NEXT: addic 7, 9, -1 +; PPC64-NEXT: add 3, 10, 8 +; PPC64-NEXT: subfe 7, 7, 9 +; PPC64-NEXT: or 5, 5, 7 +; PPC64-NEXT: subc 7, 3, 10 +; PPC64-NEXT: subfe 7, 3, 3 +; PPC64-NEXT: neg 7, 7 ; PPC64-NEXT: or 5, 5, 7 ; PPC64-NEXT: mulld 4, 4, 6 ; PPC64-NEXT: blr ; ; PPC32-LABEL: muloti_test: ; PPC32: # %bb.0: # %start -; PPC32-NEXT: stwu 1, -64(1) -; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: mfcr 12 -; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 27, 9, 4 -; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill +; PPC32-NEXT: stwu 1, -80(1) ; PPC32-NEXT: mr 11, 7 -; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: li 7, 0 -; PPC32-NEXT: mullw 26, 3, 10 -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: add 27, 26, 27 -; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 7, 11, 0 -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 24, 11, 6 -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu 0, 8, 6 -; PPC32-NEXT: stw 12, 16(1) -; PPC32-NEXT: mr 12, 5 -; PPC32-NEXT: mulhwu 5, 4, 10 -; PPC32-NEXT: addc 5, 5, 27 -; PPC32-NEXT: addze 27, 7 -; PPC32-NEXT: cmpwi 2, 27, 0 -; PPC32-NEXT: mullw 25, 12, 8 -; PPC32-NEXT: add 26, 24, 25 -; PPC32-NEXT: addc 0, 0, 26 -; PPC32-NEXT: addze 26, 7 -; PPC32-NEXT: mullw 23, 8, 6 -; PPC32-NEXT: mullw 22, 4, 10 -; PPC32-NEXT: addc 24, 22, 23 -; PPC32-NEXT: adde 22, 5, 0 -; PPC32-NEXT: mulhwu 29, 6, 10 -; PPC32-NEXT: mullw 21, 12, 10 -; PPC32-NEXT: addc 
5, 21, 29 -; PPC32-NEXT: mulhwu 30, 12, 10 -; PPC32-NEXT: addze 0, 30 -; PPC32-NEXT: mullw 23, 6, 9 -; PPC32-NEXT: addc 5, 23, 5 -; PPC32-NEXT: mulhwu 28, 6, 9 -; PPC32-NEXT: addze 29, 28 -; PPC32-NEXT: addc 0, 0, 29 -; PPC32-NEXT: addze 29, 7 -; PPC32-NEXT: mullw 30, 12, 9 -; PPC32-NEXT: addc 0, 30, 0 -; PPC32-NEXT: mulhwu 25, 12, 9 -; PPC32-NEXT: adde 30, 25, 29 -; PPC32-NEXT: addc 0, 0, 24 -; PPC32-NEXT: adde 30, 30, 22 -; PPC32-NEXT: addze. 29, 7 +; PPC32-NEXT: stw 26, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu. 26, 11, 6 +; PPC32-NEXT: stw 24, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 27, 60(1) # 4-byte Folded Spill ; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: mulhwu. 29, 11, 6 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: mulhwu. 29, 12, 8 +; PPC32-NEXT: stw 19, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu 27, 6, 10 +; PPC32-NEXT: stw 20, 32(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 6, 11, 0 +; PPC32-NEXT: stw 21, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: li 7, 0 +; PPC32-NEXT: stw 22, 40(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu. 26, 5, 8 +; PPC32-NEXT: stw 23, 44(1) # 4-byte Folded Spill ; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: cmpwi 12, 0 -; PPC32-NEXT: crnor 20, 2, 30 +; PPC32-NEXT: stw 25, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 5, 0 +; PPC32-NEXT: stw 28, 64(1) # 4-byte Folded Spill +; PPC32-NEXT: mullw 24, 5, 10 +; PPC32-NEXT: stw 29, 68(1) # 4-byte Folded Spill +; PPC32-NEXT: crnor 20, 2, 26 +; PPC32-NEXT: stw 30, 72(1) # 4-byte Folded Spill ; PPC32-NEXT: cmpwi 3, 0 -; PPC32-NEXT: cmpwi 7, 9, 0 -; PPC32-NEXT: crnor 24, 30, 2 -; PPC32-NEXT: mulhwu. 12, 3, 10 -; PPC32-NEXT: crorc 20, 20, 26 -; PPC32-NEXT: mcrf 7, 0 +; PPC32-NEXT: stw 12, 24(1) +; PPC32-NEXT: mulhwu 30, 5, 10 +; PPC32-NEXT: cmpwi 6, 9, 0 +; PPC32-NEXT: crnor 21, 26, 2 +; PPC32-NEXT: crorc 20, 20, 6 ; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: cmpwi 26, 0 -; PPC32-NEXT: crorc 28, 20, 2 +; PPC32-NEXT: mulhwu 12, 5, 9 +; PPC32-NEXT: mullw 26, 5, 9 +; PPC32-NEXT: mullw 22, 5, 8 +; PPC32-NEXT: addc 5, 24, 27 +; PPC32-NEXT: addze 30, 30 +; PPC32-NEXT: mullw 23, 6, 9 +; PPC32-NEXT: addc 5, 23, 5 +; PPC32-NEXT: mullw 21, 11, 6 +; PPC32-NEXT: add 27, 21, 22 +; PPC32-NEXT: mulhwu 28, 8, 6 +; PPC32-NEXT: add 27, 28, 27 +; PPC32-NEXT: cmplw 7, 27, 28 +; PPC32-NEXT: mulhwu. 23, 3, 10 +; PPC32-NEXT: mcrf 6, 0 +; PPC32-NEXT: cror 24, 20, 28 +; PPC32-NEXT: crorc 25, 21, 26 +; PPC32-NEXT: mulhwu 0, 6, 9 +; PPC32-NEXT: mullw 20, 9, 4 ; PPC32-NEXT: mulhwu. 9, 9, 4 -; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: crorc 20, 24, 30 +; PPC32-NEXT: mcrf 1, 0 +; PPC32-NEXT: addze 9, 0 +; PPC32-NEXT: mullw 19, 3, 10 ; PPC32-NEXT: or. 3, 4, 3 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: or. 3, 8, 11 -; PPC32-NEXT: crorc 20, 20, 10 -; PPC32-NEXT: crnor 21, 2, 26 +; PPC32-NEXT: mcrf 5, 0 +; PPC32-NEXT: addc 3, 30, 9 +; PPC32-NEXT: add 24, 19, 20 +; PPC32-NEXT: mulhwu 29, 4, 10 +; PPC32-NEXT: add 28, 29, 24 +; PPC32-NEXT: cmplw 2, 28, 29 +; PPC32-NEXT: crorc 20, 25, 6 +; PPC32-NEXT: cror 20, 20, 8 +; PPC32-NEXT: mullw 22, 4, 10 +; PPC32-NEXT: or. 
4, 8, 11 +; PPC32-NEXT: addze 4, 7 +; PPC32-NEXT: crnor 21, 2, 22 ; PPC32-NEXT: cror 20, 21, 20 -; PPC32-NEXT: cror 20, 20, 28 -; PPC32-NEXT: crandc 20, 6, 20 +; PPC32-NEXT: mullw 25, 8, 6 +; PPC32-NEXT: addc 8, 26, 3 +; PPC32-NEXT: adde 9, 12, 4 +; PPC32-NEXT: addc 3, 22, 25 +; PPC32-NEXT: adde 11, 28, 27 +; PPC32-NEXT: addc 4, 8, 3 +; PPC32-NEXT: adde 3, 9, 11 +; PPC32-NEXT: cmplw 1, 3, 9 +; PPC32-NEXT: cmplw 4, 8 +; PPC32-NEXT: crandc 22, 4, 6 ; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: bc 12, 20, .LBB0_2 +; PPC32-NEXT: bc 12, 22, .LBB0_3 ; PPC32-NEXT: # %bb.1: # %start +; PPC32-NEXT: crand 21, 6, 0 +; PPC32-NEXT: bc 12, 21, .LBB0_3 +; PPC32-NEXT: # %bb.2: # %start +; PPC32-NEXT: cror 20, 20, 24 +; PPC32-NEXT: bc 4, 20, .LBB0_4 +; PPC32-NEXT: .LBB0_3: # %start ; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: .LBB0_2: # %start -; PPC32-NEXT: lwz 12, 16(1) -; PPC32-NEXT: mr 3, 30 -; PPC32-NEXT: mr 4, 0 -; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: .LBB0_4: # %start +; PPC32-NEXT: lwz 12, 24(1) +; PPC32-NEXT: lwz 30, 72(1) # 4-byte Folded Reload ; PPC32-NEXT: mtcrf 32, 12 # cr2 -; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 28, 48(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 27, 44(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 26, 40(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 25, 36(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 24, 32(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; PPC32-NEXT: addi 1, 1, 64 +; PPC32-NEXT: lwz 29, 68(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 28, 64(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 27, 60(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 26, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 25, 52(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 24, 48(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 23, 44(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 22, 40(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 21, 36(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 20, 32(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 19, 28(1) # 4-byte Folded Reload +; PPC32-NEXT: addi 1, 1, 80 ; PPC32-NEXT: blr start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll index 515dd0f70e948..e5c5356ce50a4 100644 --- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll @@ -207,32 +207,33 @@ define i1 @test_urem_oversized(i66 %X) nounwind { ; PPC: # %bb.0: ; PPC-NEXT: lis 6, -12795 ; PPC-NEXT: ori 6, 6, 40665 -; PPC-NEXT: mulhwu 8, 5, 6 +; PPC-NEXT: mulhwu 7, 5, 6 ; PPC-NEXT: lis 9, 12057 ; PPC-NEXT: ori 9, 9, 37186 ; PPC-NEXT: mullw 11, 4, 6 -; PPC-NEXT: addc 8, 11, 8 +; PPC-NEXT: addc 7, 11, 7 ; PPC-NEXT: lis 11, -5526 ; PPC-NEXT: ori 11, 11, 61135 -; PPC-NEXT: mulhwu 7, 4, 6 -; PPC-NEXT: addze 7, 7 +; PPC-NEXT: mulhwu 8, 4, 6 +; PPC-NEXT: addze 8, 8 ; PPC-NEXT: mulhwu 10, 5, 9 ; PPC-NEXT: mullw 4, 4, 9 ; PPC-NEXT: mullw 9, 5, 9 -; PPC-NEXT: addc 8, 9, 8 -; PPC-NEXT: adde 7, 7, 10 -; PPC-NEXT: add 4, 4, 7 -; PPC-NEXT: rotlwi 9, 8, 31 +; PPC-NEXT: addc 7, 9, 7 +; PPC-NEXT: addze 9, 10 +; PPC-NEXT: rotlwi 10, 7, 31 ; PPC-NEXT: mullw 3, 3, 6 ; PPC-NEXT: mullw 6, 5, 6 ; PPC-NEXT: slwi 5, 5, 1 ; PPC-NEXT: add 3, 5, 3 ; PPC-NEXT: rotlwi 5, 6, 31 +; PPC-NEXT: rlwimi 5, 7, 31, 0, 0 +; PPC-NEXT: add 7, 8, 9 +; PPC-NEXT: 
add 4, 4, 7 ; PPC-NEXT: add 3, 4, 3 -; PPC-NEXT: rlwimi 5, 8, 31, 0, 0 -; PPC-NEXT: rlwimi 9, 3, 31, 0, 0 +; PPC-NEXT: rlwimi 10, 3, 31, 0, 0 ; PPC-NEXT: cmplw 5, 11 -; PPC-NEXT: cmplwi 1, 9, 13 +; PPC-NEXT: cmplwi 1, 10, 13 ; PPC-NEXT: rlwinm 3, 3, 31, 31, 31 ; PPC-NEXT: crandc 20, 4, 6 ; PPC-NEXT: crand 21, 6, 0 From d804c838933b1f35ae56343afac669ffe3bbd957 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 19 Feb 2025 14:49:22 +0000 Subject: [PATCH 059/220] [clang] Lower modf builtin using `llvm.modf` intrinsic (#126750) This updates the existing `modf[f|l]` builtin to be lowered via the `llvm.modf.*` intrinsic (rather than directly to a library call). --- clang/lib/CodeGen/CGBuiltin.cpp | 27 ++++++++++++++++++++ clang/test/CodeGen/X86/math-builtins.c | 32 ++++++++++++++++++------ clang/test/CodeGen/aix-builtin-mapping.c | 2 +- clang/test/CodeGen/builtin-attributes.c | 11 +++++--- clang/test/CodeGen/math-builtins-long.c | 6 ++--- clang/test/CodeGen/math-libcalls.c | 12 ++++----- 6 files changed, 70 insertions(+), 20 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 0bf8c845b307f..4688381040be2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -859,6 +859,24 @@ static void emitSincosBuiltin(CodeGenFunction &CGF, const CallExpr *E, StoreCos->setMetadata(LLVMContext::MD_noalias, AliasScopeList); } +static llvm::Value *emitModfBuiltin(CodeGenFunction &CGF, const CallExpr *E, + llvm::Intrinsic::ID IntrinsicID) { + llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Value *IntPartDest = CGF.EmitScalarExpr(E->getArg(1)); + + llvm::Value *Call = + CGF.Builder.CreateIntrinsic(IntrinsicID, {Val->getType()}, Val); + + llvm::Value *FractionalResult = CGF.Builder.CreateExtractValue(Call, 0); + llvm::Value *IntegralResult = CGF.Builder.CreateExtractValue(Call, 1); + + QualType DestPtrType = E->getArg(1)->getType()->getPointeeType(); + LValue IntegralLV = CGF.MakeNaturalAlignAddrLValue(IntPartDest, DestPtrType); + CGF.EmitStoreOfScalar(IntegralResult, IntegralLV); + + return FractionalResult; +} + /// EmitFAbs - Emit a call to @llvm.fabs(). static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) { Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType()); @@ -4112,6 +4130,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_frexpf128: case Builtin::BI__builtin_frexpf16: return RValue::get(emitFrexpBuiltin(*this, E, Intrinsic::frexp)); + case Builtin::BImodf: + case Builtin::BImodff: + case Builtin::BImodfl: + case Builtin::BI__builtin_modf: + case Builtin::BI__builtin_modff: + case Builtin::BI__builtin_modfl: + if (Builder.getIsFPConstrained()) + break; // TODO: Emit constrained modf intrinsic once one exists. 
+ return RValue::get(emitModfBuiltin(*this, E, Intrinsic::modf)); case Builtin::BI__builtin_isgreater: case Builtin::BI__builtin_isgreaterequal: case Builtin::BI__builtin_isless: diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index d7bf7d57fba26..d5301b7bafd9c 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -38,6 +38,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO-NEXT: [[FREXP_F128_0:%.+]] = extractvalue { fp128, i32 } [[FREXP_F128]], 0 +// NO__ERRNO: [[MODF_F64:%.+]] = call { double, double } @llvm.modf.f64(double %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F64_FP:%.+]] = extractvalue { double, double } [[MODF_F64]], 0 +// NO__ERRNO-NEXT: [[MODF_F64_IP:%.+]] = extractvalue { double, double } [[MODF_F64]], 1 +// NO__ERRNO-NEXT: store double [[MODF_F64_IP]], ptr %{{.+}}, align 8 + +// NO__ERRNO: [[MODF_F32:%.+]] = call { float, float } @llvm.modf.f32(float %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F32_FP:%.+]] = extractvalue { float, float } [[MODF_F32]], 0 +// NO__ERRNO-NEXT: [[MODF_F32_IP:%.+]] = extractvalue { float, float } [[MODF_F32]], 1 +// NO__ERRNO-NEXT: store float [[MODF_F32_IP]], ptr %{{.+}}, align 4 + +// NO__ERRNO: [[MODF_F80:%.+]] = call { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80 %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F80_FP:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[MODF_F80]], 0 +// NO__ERRNO-NEXT: [[MODF_F80_IP:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[MODF_F80]], 1 +// NO__ERRNO-NEXT: store x86_fp80 [[MODF_F80_IP]], ptr %{{.+}}, align 16 + +// NO__ERRNO: call fp128 @modff128(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) + + // NO__ERRNO: [[SINCOS_F64:%.+]] = call { double, double } @llvm.sincos.f64(double %{{.+}}) // NO__ERRNO-NEXT: [[SINCOS_F64_0:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 0 // NO__ERRNO-NEXT: [[SINCOS_F64_1:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 1 @@ -139,13 +157,13 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_modf(f,d); __builtin_modff(f,fp); __builtin_modfl(f,l); __builtin_modff128(f,l); -// NO__ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] -// NO__ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] -// NO__ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] -// NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] +// NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] __builtin_nan(c); __builtin_nanf(c); __builtin_nanl(c); 
__builtin_nanf128(c); diff --git a/clang/test/CodeGen/aix-builtin-mapping.c b/clang/test/CodeGen/aix-builtin-mapping.c index a79218c6f1d8b..cc1cc1a44f32c 100644 --- a/clang/test/CodeGen/aix-builtin-mapping.c +++ b/clang/test/CodeGen/aix-builtin-mapping.c @@ -17,6 +17,6 @@ int main() returnValue = __builtin_ldexpl(1.0L, 1); } -// CHECK: %call = call double @modf(double noundef 1.000000e+00, ptr noundef %DummyLongDouble) #3 +// CHECK: %{{.+}} = call { double, double } @llvm.modf.f64(double 1.000000e+00) // CHECK: %{{.+}} = call { double, i32 } @llvm.frexp.f64.i32(double 0.000000e+00) // CHECK: %{{.+}} = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 1) diff --git a/clang/test/CodeGen/builtin-attributes.c b/clang/test/CodeGen/builtin-attributes.c index e5b0faccfd23f..506b165fcf36e 100644 --- a/clang/test/CodeGen/builtin-attributes.c +++ b/clang/test/CodeGen/builtin-attributes.c @@ -24,6 +24,11 @@ char* f2(char* a, char* b) { return __builtin_strstr(a, b); } +// Note: Use asm label to disable intrinsic lowering of modf. +double modf(double x, double*) asm("modf"); +float modff(float x, float*) asm("modff"); +long double modfl(long double x, long double*) asm("modfl"); + // frexp is NOT readnone. It writes to its pointer argument. // // CHECK: f3 @@ -55,9 +60,9 @@ int f3(double x) { frexp(x, &e); frexpf(x, &e); frexpl(x, &e); - __builtin_modf(x, &e); - __builtin_modff(x, &e); - __builtin_modfl(x, &e); + modf(x, &e); + modff(x, &e); + modfl(x, &e); __builtin_remquo(x, x, &e); __builtin_remquof(x, x, &e); __builtin_remquol(x, x, &e); diff --git a/clang/test/CodeGen/math-builtins-long.c b/clang/test/CodeGen/math-builtins-long.c index 183349e0f0173..87e64a2eaa1c3 100644 --- a/clang/test/CodeGen/math-builtins-long.c +++ b/clang/test/CodeGen/math-builtins-long.c @@ -58,9 +58,9 @@ void foo(long double f, long double *l, int *i, const char *c) { // PPCF128: call fp128 @ldexpf128(fp128 noundef %{{.+}}, {{(signext)?.+}}) __builtin_ldexpl(f,f); - // F80: call x86_fp80 @modfl(x86_fp80 noundef %{{.+}}, ptr noundef %{{.+}}) - // PPC: call ppc_fp128 @modfl(ppc_fp128 noundef %{{.+}}, ptr noundef %{{.+}}) - // X86F128: call fp128 @modfl(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) + // F80: call { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80 %{{.+}}) + // PPC: call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %{{.+}}) + // X86F128: call { fp128, fp128 } @llvm.modf.f128(fp128 %{{.+}}) // PPCF128: call fp128 @modff128(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) __builtin_modfl(f,l); diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index 14fdee77f4d78..ad297828f48ed 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -83,12 +83,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { modf(f,d); modff(f,fp); modfl(f,l); - // NO__ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] - // NO__ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] - // NO__ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] + // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] + // 
NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
+  // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]]
+  // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
+  // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
   // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]]
   // HAS_MAYTRAP: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]]
   // HAS_MAYTRAP: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]]

From 2b340c10a611d929fee25e6222909c8915e3d6b6 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Wed, 19 Feb 2025 06:53:30 -0800
Subject: [PATCH 060/220] flang: Fix build with latest libc++ (#127362)

I think this first stopped working with
954836634abb446f18719b14120c386a929a42d1. This patch fixes the
following error:

/home/runner/work/llvm-project/llvm-project/flang/runtime/io-api-minimal.cpp:153:11: error: '__libcpp_verbose_abort' is missing exception specification 'noexcept'
  153 | void std::__libcpp_verbose_abort(char const *format, ...) {
      |           ^
      |                                                            noexcept
/mnt/build/bin/../include/c++/v1/__verbose_abort:30:28: note: previous declaration is here
   30 | __printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...) _LIBCPP_VERBOSE_ABORT_NOEXCEPT;
      |                            ^
1 error generated.
---
 flang-rt/lib/runtime/io-api-minimal.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang-rt/lib/runtime/io-api-minimal.cpp b/flang-rt/lib/runtime/io-api-minimal.cpp
index 8d8c9c6070b04..fdf7183ed5176 100644
--- a/flang-rt/lib/runtime/io-api-minimal.cpp
+++ b/flang-rt/lib/runtime/io-api-minimal.cpp
@@ -150,7 +150,8 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) {
 
 // Provide own definition for `std::__libcpp_verbose_abort` to avoid dependency
 // on the version provided by libc++.
-void std::__libcpp_verbose_abort(char const *format, ...) {
+void std::__libcpp_verbose_abort(char const *format, ...) noexcept(
+    noexcept(std::__libcpp_verbose_abort(""))) {
   va_list list;
   va_start(list, format);
   std::vfprintf(stderr, format, list);

From d6c6bde9dbcf332b5092ebcee8c7fe6fbb5aa2ae Mon Sep 17 00:00:00 2001
From: Jean-Didier PAILLEUX
Date: Wed, 19 Feb 2025 16:00:09 +0100
Subject: [PATCH 061/220] [flang] Implement !DIR$ UNROLL_AND_JAM [N] (#125046)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements support for the UNROLL_AND_JAM directive to enable or
disable unrolling and jamming on a `DO LOOP`. It must be placed immediately
before a `DO LOOP` and applies only to the loop that follows. N is an integer
specifying the unrolling factor. This is done by adding an attribute to the
branch into the loop in LLVM to indicate that the loop should be unrolled and
jammed.
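A minimal usage sketch (the subroutine and variable names, array shapes, and
the factor of 4 are illustrative only; it mirrors the lowering tests added
below):

```fortran
subroutine demo_unroll_and_jam(a, b)
  integer :: a(10, 10), b(10, 10)
  ! The directive must immediately precede the DO loop it applies to; here it
  ! requests unroll-and-jam of the outer loop with a factor of 4.
  !dir$ unroll_and_jam 4
  do j = 1, 10
    do i = 1, 10
      a(i, j) = a(i, j) + b(i, j)
    end do
  end do
end subroutine demo_unroll_and_jam
```

With a factor greater than 1 this is expected to lower to
`llvm.loop.unroll_and_jam.enable` plus `llvm.loop.unroll_and_jam.count` loop
metadata on the branch into the loop; a factor of 0 or 1 maps to
`llvm.loop.unroll_and_jam.disable`, and omitting the factor enables
unroll-and-jam without a count, as checked by the integration test below.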
---
 flang/docs/Directives.md                      |  5 ++
 flang/include/flang/Parser/dump-parse-tree.h  |  1 +
 flang/include/flang/Parser/parse-tree.h       |  7 ++-
 flang/lib/Lower/Bridge.cpp                    | 39 ++++++++++++++-
 flang/lib/Parser/Fortran-parsers.cpp          |  3 ++
 flang/lib/Parser/unparse.cpp                  |  4 ++
 .../lib/Semantics/canonicalize-directives.cpp |  6 ++-
 flang/lib/Semantics/resolve-names.cpp         |  3 +-
 flang/test/Integration/unroll_and_jam.f90     | 48 +++++++++++++++++++
 flang/test/Lower/unroll_and_jam.f90           | 34 +++++++++++++
 flang/test/Parser/compiler-directives.f90     | 11 +++++
 11 files changed, 156 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Integration/unroll_and_jam.f90
 create mode 100644 flang/test/Lower/unroll_and_jam.f90

diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md
index c6c2e29a420ea..5e76d4331f6de 100644
--- a/flang/docs/Directives.md
+++ b/flang/docs/Directives.md
@@ -45,6 +45,11 @@ A list of non-standard directives supported by Flang
   times if possible. When `n` is omitted, the compiler should attempt to fully
   unroll the loop. Some compilers accept an optional `=` before the `n` when `n`
   is present in the directive. Flang does not.
+* `!dir$ unroll_and_jam [N]` controls how many times a loop should be unrolled
+  and jammed. It must be placed immediately before the loop to which it
+  applies. `N` is an optional integer specifying the unrolling factor. When
+  `N` is `0` or `1`, the loop should not be unrolled at all. If `N` is
+  omitted, the optimizer selects the number of times to unroll the loop.
 
 # Directive Details
 
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 21ee1d0517840..75c11301285b3 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -209,6 +209,7 @@ class ParseTreeDumper {
   NODE(CompilerDirective, Unrecognized)
   NODE(CompilerDirective, VectorAlways)
   NODE(CompilerDirective, Unroll)
+  NODE(CompilerDirective, UnrollAndJam)
   NODE(parser, ComplexLiteralConstant)
   NODE(parser, ComplexPart)
   NODE(parser, ComponentArraySpec)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 6ba43f6688c25..c2fa9a2228180 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3349,6 +3349,8 @@ struct StmtFunctionStmt {
 // !DIR$ IGNORE_TKR [ [(tkrdmac...)] name ]...
 // !DIR$ LOOP COUNT (n1[, n2]...)
 // !DIR$ name[=value] [, name[=value]]... = can be :
+// !DIR$ UNROLL [N]
+// !DIR$ UNROLL_AND_JAM [N]
 // !DIR$
 struct CompilerDirective {
   UNION_CLASS_BOILERPLATE(CompilerDirective);
@@ -3371,10 +3373,13 @@ struct CompilerDirective {
   struct Unroll {
     WRAPPER_CLASS_BOILERPLATE(Unroll, std::optional);
   };
+  struct UnrollAndJam {
+    WRAPPER_CLASS_BOILERPLATE(UnrollAndJam, std::optional);
+  };
   EMPTY_CLASS(Unrecognized);
   CharBlock source;
   std::variant, LoopCount, std::list,
-      VectorAlways, std::list, Unroll, Unrecognized>
+      VectorAlways, std::list, Unroll, UnrollAndJam, Unrecognized>
       u;
 };
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 7c217ce2f404c..1b24ed12e04f1 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2205,11 +2205,39 @@ class FirConverter : public Fortran::lower::AbstractConverter {
                                        /*full=*/fullUnrollAttr, {}, {}, {});
   }
 
+  // Generate the LLVM loop annotation for an unroll_and_jam directive.
+  // Without a value, unroll-and-jam is simply enabled. For directives with
+  // a value greater than 1, unroll-and-jam is forced with the given factor;
+  // otherwise (a value of 0 or 1), unrolling and jamming are disabled.
+ mlir::LLVM::LoopUnrollAndJamAttr + genLoopUnrollAndJamAttr(std::optional count) { + mlir::BoolAttr falseAttr = + mlir::BoolAttr::get(builder->getContext(), false); + mlir::BoolAttr trueAttr = mlir::BoolAttr::get(builder->getContext(), true); + mlir::IntegerAttr countAttr; + bool shouldUnroll = true; + if (count.has_value()) { + auto unrollingFactor = count.value(); + if (unrollingFactor == 0 || unrollingFactor == 1) { + shouldUnroll = false; + } else { + countAttr = + builder->getIntegerAttr(builder->getI64Type(), unrollingFactor); + } + } + + mlir::BoolAttr disableAttr = shouldUnroll ? falseAttr : trueAttr; + return mlir::LLVM::LoopUnrollAndJamAttr::get( + builder->getContext(), /*disable=*/disableAttr, /*count*/ countAttr, {}, + {}, {}, {}, {}); + } + void addLoopAnnotationAttr( IncrementLoopInfo &info, llvm::SmallVectorImpl &dirs) { mlir::LLVM::LoopVectorizeAttr va; mlir::LLVM::LoopUnrollAttr ua; + mlir::LLVM::LoopUnrollAndJamAttr uja; bool has_attrs = false; for (const auto *dir : dirs) { Fortran::common::visit( @@ -2226,12 +2254,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { ua = genLoopUnrollAttr(u.v); has_attrs = true; }, + [&](const Fortran::parser::CompilerDirective::UnrollAndJam &u) { + uja = genLoopUnrollAndJamAttr(u.v); + has_attrs = true; + }, [&](const auto &) {}}, dir->u); } mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get( - builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}); + builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, + /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}); if (has_attrs) info.doLoop.setLoopAnnotationAttr(la); } @@ -2887,6 +2919,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { [&](const Fortran::parser::CompilerDirective::Unroll &) { attachDirectiveToLoop(dir, &eval); }, + [&](const Fortran::parser::CompilerDirective::UnrollAndJam &) { + attachDirectiveToLoop(dir, &eval); + }, [&](const auto &) {}}, dir.u); } diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp index b5bcb53a12761..cfe9ecb29b0b7 100644 --- a/flang/lib/Parser/Fortran-parsers.cpp +++ b/flang/lib/Parser/Fortran-parsers.cpp @@ -1308,11 +1308,14 @@ constexpr auto vectorAlways{ "VECTOR ALWAYS" >> construct()}; constexpr auto unroll{ "UNROLL" >> construct(maybe(digitString64))}; +constexpr auto unrollAndJam{"UNROLL_AND_JAM" >> + construct(maybe(digitString64))}; TYPE_PARSER(beginDirective >> "DIR$ "_tok >> sourced((construct(ignore_tkr) || construct(loopCount) || construct(assumeAligned) || construct(vectorAlways) || + construct(unrollAndJam) || construct(unroll) || construct( many(construct( diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 3d00979d7b7a6..6260a01897527 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1851,6 +1851,10 @@ class UnparseVisitor { Word("!DIR$ UNROLL"); Walk(" ", unroll.v); }, + [&](const CompilerDirective::UnrollAndJam &unrollAndJam) { + Word("!DIR$ UNROLL_AND_JAM"); + Walk(" ", unrollAndJam.v); + }, [&](const CompilerDirective::Unrecognized &) { Word("!DIR$ "); Word(x.source.ToString()); diff --git a/flang/lib/Semantics/canonicalize-directives.cpp b/flang/lib/Semantics/canonicalize-directives.cpp index b27a27618808b..1a0a0d145b3e2 100644 --- a/flang/lib/Semantics/canonicalize-directives.cpp +++ b/flang/lib/Semantics/canonicalize-directives.cpp @@ -56,7 +56,8 @@ bool CanonicalizeDirectives( static bool 
IsExecutionDirective(const parser::CompilerDirective &dir) { return std::holds_alternative( dir.u) || - std::holds_alternative(dir.u); + std::holds_alternative(dir.u) || + std::holds_alternative(dir.u); } void CanonicalizationOfDirectives::Post(parser::SpecificationPart &spec) { @@ -115,6 +116,9 @@ void CanonicalizationOfDirectives::Post(parser::Block &block) { [&](parser::CompilerDirective::Unroll &) { CheckLoopDirective(*dir, block, it); }, + [&](parser::CompilerDirective::UnrollAndJam &) { + CheckLoopDirective(*dir, block, it); + }, [&](auto &) {}}, dir->u); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index ff793658f1e06..17a6665dfb6a5 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -9552,7 +9552,8 @@ void ResolveNamesVisitor::Post(const parser::AssignedGotoStmt &x) { void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) { if (std::holds_alternative(x.u) || - std::holds_alternative(x.u)) { + std::holds_alternative(x.u) || + std::holds_alternative(x.u)) { return; } if (const auto *tkr{ diff --git a/flang/test/Integration/unroll_and_jam.f90 b/flang/test/Integration/unroll_and_jam.f90 new file mode 100644 index 0000000000000..771b7fb411855 --- /dev/null +++ b/flang/test/Integration/unroll_and_jam.f90 @@ -0,0 +1,48 @@ +! RUN: %flang_fc1 -emit-llvm -o - %s | FileCheck %s + +! CHECK-LABEL: unroll_and_jam_dir +subroutine unroll_and_jam_dir + integer :: a(10) + !dir$ unroll_and_jam 4 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir + +! CHECK-LABEL: unroll_and_jam_dir_0 +subroutine unroll_and_jam_dir_0 + integer :: a(10) + !dir$ unroll_and_jam 0 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_0 + +! CHECK-LABEL: unroll_and_jam_dir_1 +subroutine unroll_and_jam_dir_1 + integer :: a(10) + !dir$ unroll_and_jam 1 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_1 + +! CHECK-LABEL: unroll_and_jam_dir_no_factor +subroutine unroll_and_jam_dir_no_factor + integer :: a(10) + !dir$ unroll_and_jam + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_NO_FACTOR:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_no_factor + +! CHECK: ![[ANNOTATION]] = distinct !{![[ANNOTATION]], ![[UNROLL_AND_JAM:.*]], ![[UNROLL_AND_JAM_COUNT:.*]]} +! CHECK: ![[UNROLL_AND_JAM]] = !{!"llvm.loop.unroll_and_jam.enable"} +! CHECK: ![[UNROLL_AND_JAM_COUNT]] = !{!"llvm.loop.unroll_and_jam.count", i32 4} +! CHECK: ![[ANNOTATION_DISABLE]] = distinct !{![[ANNOTATION_DISABLE]], ![[UNROLL_AND_JAM2:.*]]} +! CHECK: ![[UNROLL_AND_JAM2]] = !{!"llvm.loop.unroll_and_jam.disable"} +! CHECK: ![[ANNOTATION_NO_FACTOR]] = distinct !{![[ANNOTATION_NO_FACTOR]], ![[UNROLL_AND_JAM]]} diff --git a/flang/test/Lower/unroll_and_jam.f90 b/flang/test/Lower/unroll_and_jam.f90 new file mode 100644 index 0000000000000..afc5a7b6b271e --- /dev/null +++ b/flang/test/Lower/unroll_and_jam.f90 @@ -0,0 +1,34 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s + +! CHECK: #loop_unroll_and_jam = #llvm.loop_unroll_and_jam +! CHECK: #loop_unroll_and_jam1 = #llvm.loop_unroll_and_jam +! CHECK: #loop_annotation = #llvm.loop_annotation +! CHECK: #loop_annotation1 = #llvm.loop_annotation + +! 
CHECK-LABEL: unroll_and_jam_dir +subroutine unroll_and_jam_dir + integer :: a(10) + !dir$ unroll_and_jam + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation} + do i=1,10 + a(i)=i + end do + + !dir$ unroll_and_jam 2 + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation1} + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir + + +! CHECK-LABEL: intermediate_directive +subroutine intermediate_directive + integer :: a(10) + !dir$ unroll_and_jam + !dir$ unknown + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation} + do i=1,10 + a(i)=i + end do +end subroutine intermediate_directive diff --git a/flang/test/Parser/compiler-directives.f90 b/flang/test/Parser/compiler-directives.f90 index f372a9f533a35..d1e386a01dd4d 100644 --- a/flang/test/Parser/compiler-directives.f90 +++ b/flang/test/Parser/compiler-directives.f90 @@ -46,3 +46,14 @@ subroutine unroll do i=1,10 enddo end subroutine + +subroutine unroll_and_jam + !dir$ unroll_and_jam + ! CHECK: !DIR$ UNROLL_AND_JAM + do i=1,10 + enddo + !dir$ unroll_and_jam 2 + ! CHECK: !DIR$ UNROLL_AND_JAM 2 + do i=1,10 + enddo +end subroutine From 0b63dfb06698ea1a78ba09506f83a1d427a868b1 Mon Sep 17 00:00:00 2001 From: lorenzo chelini Date: Wed, 19 Feb 2025 09:05:25 -0600 Subject: [PATCH 062/220] [MLIR][NFC] Use base alias for constructor inheritance (#127756) During my previous cleanup (#127403), I did not notice that we defined a type alias for the base class. This type alias allows us to use the shorter form Base::Base, and this PR switches to that. --- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 +- mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp | 2 +- mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp | 2 +- .../Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp | 2 +- mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp | 2 +- mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp | 2 +- mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp | 2 +- mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp | 2 +- mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 36fbdbed4ae2f..b29228ef87ea7 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1038,7 +1038,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern { struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { - using ConvertAMDGPUToROCDLPassBase::ConvertAMDGPUToROCDLPassBase; + using Base::Base; void runOnOperation() override { MLIRContext *ctx = &getContext(); diff --git a/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp b/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp index 5887e37b7f0b4..1f2781aa82114 100644 --- a/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp +++ b/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp @@ -1338,7 +1338,7 @@ void mlir::arith::populateArithToSPIRVPatterns( namespace { struct ConvertArithToSPIRVPass : public impl::ConvertArithToSPIRVPassBase { - using ConvertArithToSPIRVPassBase::ConvertArithToSPIRVPassBase; + using Base::Base; void runOnOperation() override { Operation *op = getOperation(); diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index ea25d5afaeeca..5089179435f1e 100644 --- 
a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -1072,7 +1072,7 @@ namespace { struct ConvertComplexToStandardPass : public impl::ConvertComplexToStandardPassBase< ConvertComplexToStandardPass> { - using ConvertComplexToStandardPassBase::ConvertComplexToStandardPassBase; + using Base::Base; void runOnOperation() override; }; diff --git a/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp b/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp index a0ae39a353a95..03f4bf4df4912 100644 --- a/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp +++ b/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp @@ -28,7 +28,7 @@ namespace { class ConvertControlFlowToSPIRVPass final : public impl::ConvertControlFlowToSPIRVPassBase< ConvertControlFlowToSPIRVPass> { - using ConvertControlFlowToSPIRVPassBase::ConvertControlFlowToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp b/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp index 572a432d6d641..8ed9f659afb10 100644 --- a/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp +++ b/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp @@ -27,7 +27,7 @@ namespace { /// A pass converting MLIR Func operations into the SPIR-V dialect. class ConvertFuncToSPIRVPass : public impl::ConvertFuncToSPIRVPassBase { - using ConvertFuncToSPIRVPassBase::ConvertFuncToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp index 5d53aef199d52..b06ab44d159af 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp @@ -27,7 +27,7 @@ namespace { /// A pass converting MLIR MemRef operations into the SPIR-V dialect. class ConvertMemRefToSPIRVPass : public impl::ConvertMemRefToSPIRVPassBase { - using ConvertMemRefToSPIRVPassBase::ConvertMemRefToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp index 8e2efbc7f4280..99631705851fd 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp @@ -34,7 +34,7 @@ namespace { // walk the function recursively to avoid considering nested loops. struct ForLoopMapper : public impl::ConvertAffineForToGPUPassBase { - using ConvertAffineForToGPUPassBase::ConvertAffineForToGPUPassBase; + using Base::Base; void runOnOperation() override { for (Operation &op : llvm::make_early_inc_range( diff --git a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp index 9e98dc7d7aaf6..f07386ea80124 100644 --- a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp +++ b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp @@ -29,7 +29,7 @@ namespace { /// A pass converting MLIR Tensor operations into the SPIR-V dialect. 
class ConvertTensorToSPIRVPass : public impl::ConvertTensorToSPIRVPassBase { - using ConvertTensorToSPIRVPassBase::ConvertTensorToSPIRVPassBase; + using Base::Base; void runOnOperation() override { MLIRContext *context = &getContext(); diff --git a/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp b/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp index 15ddd3f5c16f1..ede3c9e0040fd 100644 --- a/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp +++ b/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp @@ -30,7 +30,7 @@ using namespace tosa; namespace { struct TosaToArith : public impl::TosaToArithPassBase { - using TosaToArithPassBase::TosaToArithPassBase; + using Base::Base; void runOnOperation() override { RewritePatternSet patterns(&getContext()); From fb5a87e1a6febb2a81fd85f800e78c2e6dff5715 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 19 Feb 2025 15:17:58 +0000 Subject: [PATCH 063/220] [libclc][NFC] Reformat ep_log.cl --- libclc/generic/lib/math/ep_log.cl | 101 +++++++++++++++--------------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/libclc/generic/lib/math/ep_log.cl b/libclc/generic/lib/math/ep_log.cl index 90c9fa426fec1..65db94a85b9b4 100644 --- a/libclc/generic/lib/math/ep_log.cl +++ b/libclc/generic/lib/math/ep_log.cl @@ -38,57 +38,56 @@ #define LF1 1.24999999978138668903e-02 #define LF2 2.23219810758559851206e-03 -_CLC_DEF void __clc_ep_log(double x, int *xexp, double *r1, double *r2) -{ - // Computes natural log(x). Algorithm based on: - // Ping-Tak Peter Tang - // "Table-driven implementation of the logarithm function in IEEE - // floating-point arithmetic" - // ACM Transactions on Mathematical Software (TOMS) - // Volume 16, Issue 4 (December 1990) - int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0; - - ulong ux = as_ulong(x); - ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); - int c = ux < IMPBIT_DP64; - ux = c ? uxs : ux; - int expadjust = c ? 60 : 0; - - // Store the exponent of x in xexp and put f into the range [0.5,1) - int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; - double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); - *xexp = near_one ? 0 : xexp1; - - double r = x - 1.0; - double u1 = MATH_DIVIDE(r, 2.0 + r); - double ru1 = -r * u1; - u1 = u1 + u1; - - int index = as_int2(ux).hi >> 13; - index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); - - double f1 = index * 0x1.0p-7; - double f2 = f - f1; - double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1)); - - double2 tv = USE_TABLE(ln_tbl, (index - 64)); - double z1 = tv.s0; - double q = tv.s1; - - z1 = near_one ? r : z1; - q = near_one ? 0.0 : q; - double u = near_one ? u1 : u2; - double v = u*u; - - double cc = near_one ? ru1 : u2; - - double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0); - double z22 = fma(v, fma(v, LF2, LF1), LF0); - double z2 = near_one ? z21 : z22; - z2 = fma(u*v, z2, cc) + q; - - *r1 = z1; - *r2 = z2; +_CLC_DEF void __clc_ep_log(double x, int *xexp, double *r1, double *r2) { + // Computes natural log(x). Algorithm based on: + // Ping-Tak Peter Tang + // "Table-driven implementation of the logarithm function in IEEE + // floating-point arithmetic" + // ACM Transactions on Mathematical Software (TOMS) + // Volume 16, Issue 4 (December 1990) + int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0; + + ulong ux = as_ulong(x); + ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); + int c = ux < IMPBIT_DP64; + ux = c ? uxs : ux; + int expadjust = c ? 
60 : 0; + + // Store the exponent of x in xexp and put f into the range [0.5,1) + int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; + double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); + *xexp = near_one ? 0 : xexp1; + + double r = x - 1.0; + double u1 = MATH_DIVIDE(r, 2.0 + r); + double ru1 = -r * u1; + u1 = u1 + u1; + + int index = as_int2(ux).hi >> 13; + index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); + + double f1 = index * 0x1.0p-7; + double f2 = f - f1; + double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1)); + + double2 tv = USE_TABLE(ln_tbl, (index - 64)); + double z1 = tv.s0; + double q = tv.s1; + + z1 = near_one ? r : z1; + q = near_one ? 0.0 : q; + double u = near_one ? u1 : u2; + double v = u * u; + + double cc = near_one ? ru1 : u2; + + double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0); + double z22 = fma(v, fma(v, LF2, LF1), LF0); + double z2 = near_one ? z21 : z22; + z2 = fma(u * v, z2, cc) + q; + + *r1 = z1; + *r2 = z2; } #endif From 26a83994176fcdca6e77be4f221a15f561681621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20=C3=81lvarez=20Ayll=C3=B3n?= Date: Wed, 19 Feb 2025 16:19:31 +0100 Subject: [PATCH 064/220] [clang][Sema] Fix initialization of `NonTypeTemplateParmDecl`... (#121768) ...when there are invalid constraints. When attaching a `TypeConstraint`, in case of error, the trailing pointer that is supposed to point to the constraint is left uninitialized. Sometimes the uninitialized value will be a `nullptr`, but at other times it will not. If we traverse the AST (for instance, dumping it, or when writing the BMI), we may get a crash depending on the value that was left. The serialization may also contain a bogus value. In this commit, we always initialize the `PlaceholderTypeConstraint` with `nullptr`, to avoid accessing this uninitialized memory. This does not affect only modules, but it causes a segfault more consistently when they are involved. The test case was reduced from `mp-units`. --------- Co-authored-by: Erich Keane --- clang/lib/AST/DeclTemplate.cpp | 45 +++++++++------ clang/lib/Serialization/ASTWriterDecl.cpp | 8 +-- ...constraint-template-non-type-parm-decl.cpp | 55 +++++++++++++++++++ 3 files changed, 88 insertions(+), 20 deletions(-) create mode 100644 clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 7fb89bf5b499f..63caf04f7ef38 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -786,12 +786,16 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( QualType T, bool ParameterPack, TypeSourceInfo *TInfo) { AutoType *AT = C.getLangOpts().CPlusPlus20 ? T->getContainedAutoType() : nullptr; - return new (C, DC, - additionalSizeToAlloc, - Expr *>(0, - AT && AT->isConstrained() ? 1 : 0)) - NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, ParameterPack, - TInfo); + const bool HasConstraint = AT && AT->isConstrained(); + auto *NTTP = + new (C, DC, + additionalSizeToAlloc, Expr *>( + 0, HasConstraint ? 
1 : 0)) + NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, + ParameterPack, TInfo); + if (HasConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); + return NTTP; } NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( @@ -800,23 +804,30 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( QualType T, TypeSourceInfo *TInfo, ArrayRef ExpandedTypes, ArrayRef ExpandedTInfos) { AutoType *AT = TInfo->getType()->getContainedAutoType(); - return new (C, DC, - additionalSizeToAlloc, - Expr *>( - ExpandedTypes.size(), AT && AT->isConstrained() ? 1 : 0)) - NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, TInfo, - ExpandedTypes, ExpandedTInfos); + const bool HasConstraint = AT && AT->isConstrained(); + auto *NTTP = + new (C, DC, + additionalSizeToAlloc, Expr *>( + ExpandedTypes.size(), HasConstraint ? 1 : 0)) + NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, TInfo, + ExpandedTypes, ExpandedTInfos); + if (HasConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); + return NTTP; } NonTypeTemplateParmDecl * NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID, bool HasTypeConstraint) { - return new (C, ID, additionalSizeToAlloc, - Expr *>(0, - HasTypeConstraint ? 1 : 0)) + auto *NTTP = + new (C, ID, + additionalSizeToAlloc, Expr *>( + 0, HasTypeConstraint ? 1 : 0)) NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(), 0, 0, nullptr, QualType(), false, nullptr); + if (HasTypeConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); + return NTTP; } NonTypeTemplateParmDecl * @@ -830,6 +841,8 @@ NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID, NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(), 0, 0, nullptr, QualType(), nullptr, {}, {}); NTTP->NumExpandedTypes = NumExpandedTypes; + if (HasTypeConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); return NTTP; } diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index b25dadab656b0..ac80bb46afa2d 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2016,8 +2016,7 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { // For an expanded parameter pack, record the number of expansion types here // so that it's easier for deserialization to allocate the right amount of // memory. - Expr *TypeConstraint = D->getPlaceholderTypeConstraint(); - Record.push_back(!!TypeConstraint); + Record.push_back(D->hasPlaceholderTypeConstraint()); if (D->isExpandedParameterPack()) Record.push_back(D->getNumExpansionTypes()); @@ -2025,8 +2024,9 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { // TemplateParmPosition. 
Record.push_back(D->getDepth()); Record.push_back(D->getPosition()); - if (TypeConstraint) - Record.AddStmt(TypeConstraint); + + if (D->hasPlaceholderTypeConstraint()) + Record.AddStmt(D->getPlaceholderTypeConstraint()); if (D->isExpandedParameterPack()) { for (unsigned I = 0, N = D->getNumExpansionTypes(); I != N; ++I) { diff --git a/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp b/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp new file mode 100644 index 0000000000000..73dff88e506b4 --- /dev/null +++ b/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp @@ -0,0 +1,55 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t + +// RUN: %clang_cc1 -std=c++20 mod.cppm -emit-module-interface -o mod.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++20 main.cpp -fmodule-file=mod=mod.pcm -verify -fallow-pcm-with-compiler-errors -fsyntax-only -ast-dump-all | FileCheck %s + +// RUN: %clang_cc1 -std=c++20 mod.cppm -emit-reduced-module-interface -o mod.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++20 main.cpp -fmodule-file=mod=mod.pcm -verify -fallow-pcm-with-compiler-errors -fsyntax-only -ast-dump-all | FileCheck %s + +//--- mod.cppm +export module mod; + +template +concept ReferenceOf = Q; + +// expected-error@+2 {{unknown type name 'AngleIsInvalidNow'}} +// expected-error@+1 {{constexpr variable 'angle' must be initialized by a constant expression}} +constexpr struct angle {AngleIsInvalidNow e;} angle; + +// expected-error@+1 {{non-type template argument is not a constant expression}} +template auto R, typename Rep> requires requires(Rep v) {cos(v);} +auto cos(const Rep& q); + +// expected-error@+1 {{non-type template argument is not a constant expression}} +template auto R, typename Rep> requires requires(Rep v) {tan(v);} +auto tan(const Rep& q); + +//--- main.cpp +// expected-no-diagnostics +import mod; + +// CHECK: |-FunctionTemplateDecl {{.*}} col:6 imported in mod hidden invalid cos +// CHECK-NEXT: | |-NonTypeTemplateParmDecl {{.*}} col:34 imported in mod hidden referenced invalid 'ReferenceOf auto' depth 0 index 0 R +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} col:46 imported in mod hidden referenced typename depth 0 index 1 Rep +// CHECK-NEXT: | |-RequiresExpr {{.*}} 'bool' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} col:73 imported in mod hidden referenced v 'Rep' +// CHECK-NEXT: | | `-SimpleRequirement {{.*}} dependent +// CHECK-NEXT: | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | |-UnresolvedLookupExpr {{.*}} '' lvalue (ADL) = 'cos' empty +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'Rep' lvalue ParmVar {{.*}} 'v' 'Rep' non_odr_use_unevaluated +// CHECK-NEXT: | `-FunctionDecl {{.*}} col:6 imported in mod hidden cos 'auto (const Rep &)' +// CHECK-NEXT: | `-ParmVarDecl {{.*}} col:21 imported in mod hidden q 'const Rep &' + +// CHECK: |-FunctionTemplateDecl {{.*}} col:6 imported in mod hidden invalid tan +// CHECK-NEXT: | |-NonTypeTemplateParmDecl {{.*}} col:34 imported in mod hidden referenced invalid 'ReferenceOf auto' depth 0 index 0 R +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} col:46 imported in mod hidden referenced typename depth 0 index 1 Rep +// CHECK-NEXT: | |-RequiresExpr {{.*}} 'bool' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} col:73 imported in mod hidden referenced v 'Rep' +// CHECK-NEXT: | | `-SimpleRequirement {{.*}} dependent +// CHECK-NEXT: | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | |-UnresolvedLookupExpr {{.*}} '' lvalue (ADL) = 'tan' empty +// 
CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'Rep' lvalue ParmVar {{.*}} 'v' 'Rep' non_odr_use_unevaluated +// CHECK-NEXT: | `-FunctionDecl {{.*}} col:6 imported in mod hidden tan 'auto (const Rep &)' +// CHECK-NEXT: | `-ParmVarDecl {{.*}} col:21 imported in mod hidden q 'const Rep &' From 4624087328961b2ad50935799e3b5eee49e90c23 Mon Sep 17 00:00:00 2001 From: Javier Lopez-Gomez Date: Wed, 19 Feb 2025 16:27:16 +0100 Subject: [PATCH 065/220] [llvm-dwarfdump] Print number of out-of-line functions described by DWARF (#127233) Some of the functions in `#functions` may have several inlined instances, but also an out-of-line definition. Therefore, for complex enough DWARF input, `#functions` - `#inlined functions` would not give us the number of out-of-line function definitions. `llvm-dwarfdump`, however, already keeps track of those; print it as part of the statistics, as this number is useful in certain scenarios. --- llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test | 1 + llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test | 1 + llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll | 1 + llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll | 1 + .../tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll | 1 + .../tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll | 1 + llvm/tools/llvm-dwarfdump/Statistics.cpp | 3 +++ 7 files changed, 9 insertions(+) diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test b/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test index 3e39591c46dce..81ca701e78a49 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test @@ -72,6 +72,7 @@ RUN: llvm-dwarfdump --statistics statistics-fib.split-dwarf.o | FileCheck %s CHECK: "version": 9, CHECK: "#functions": 3, CHECK: "#functions with location": 3, +CHECK: "#out-of-line functions": 3, CHECK: "#inlined functions": 7, CHECK: "#inlined functions with abstract origins": 7, CHECK: "#unique source variables": 9, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test b/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test index 855dcedc76f0b..82939c77e25d4 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test @@ -67,6 +67,7 @@ RUN: llvm-dwarfdump --statistics %t-statistics-fib.o | FileCheck %s CHECK: "version": 9, CHECK: "#functions": 3, CHECK: "#functions with location": 3, +CHECK: "#out-of-line functions": 3, CHECK: "#inlined functions": 8, CHECK: "#inlined functions with abstract origins": 8, CHECK: "#unique source variables": 9, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll index 05626e60ca0c7..97482e9c9b858 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll @@ -4,6 +4,7 @@ ; Test that abstract origins in multiple CUs are uniqued. 
; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 3, ; CHECK: "#inlined functions": 2, ; CHECK: "#unique source variables": 4, ; CHECK-NEXT: "#source variables": 6, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll index 3e4feca06d56f..25f81f31d18ac 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll @@ -5,6 +5,7 @@ ; The results for both tests should be identical. ; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 3, ; CHECK: "#inlined functions": 2, ; CHECK: "#unique source variables": 4, ; CHECK-NEXT: "#source variables": 6, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll index 85f66f492ff78..6fd3b84fdc19a 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll @@ -20,6 +20,7 @@ ; CHECK: "#functions": 3, ; CHECK-NEXT: "#functions with location": 3, +; CHECK-NEXT: "#out-of-line functions": 4, ; CHECK-NEXT: "#inlined functions": 0, ; CHECK-NEXT: "#inlined functions with abstract origins": 0, ; CHECK-NEXT: "#unique source variables": 1, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll index 2f1e1e15aa3a9..60ca52a274375 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll @@ -4,6 +4,7 @@ ; Test that statistics distinguish functions with the same name. ; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 4, ; CHECK: "#unique source variables": 2, ; CHECK-NEXT: "#source variables": 2, diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp index 6f2919318a6d5..1670709c08314 100644 --- a/llvm/tools/llvm-dwarfdump/Statistics.cpp +++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp @@ -971,6 +971,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, SaturatingUINT64 VarParamUnique = 0; SaturatingUINT64 VarParamWithLoc = 0; SaturatingUINT64 NumFunctions = 0; + SaturatingUINT64 NumOutOfLineFunctions = 0; SaturatingUINT64 NumInlinedFunctions = 0; SaturatingUINT64 NumFuncsWithSrcLoc = 0; SaturatingUINT64 NumAbstractOrigins = 0; @@ -999,6 +1000,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, << Entry.getKey() << ": " << V.getKey() << "\n"); NumFunctions += Stats.IsFunction; NumFuncsWithSrcLoc += Stats.HasSourceLocation; + NumOutOfLineFunctions += Stats.IsFunction * Stats.NumFnOutOfLine; NumInlinedFunctions += Stats.IsFunction * Stats.NumFnInlined; NumAbstractOrigins += Stats.IsFunction * Stats.NumAbstractOrigins; ParamTotal += Stats.NumParams; @@ -1024,6 +1026,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, printDatum(J, "#functions", NumFunctions.Value); printDatum(J, "#functions with location", NumFuncsWithSrcLoc.Value); + printDatum(J, "#out-of-line functions", NumOutOfLineFunctions.Value); printDatum(J, "#inlined functions", NumInlinedFunctions.Value); printDatum(J, "#inlined functions with abstract origins", NumAbstractOrigins.Value); From 826af1757c99e98c5816fc3ffdb1cece78107991 Mon Sep 17 00:00:00 2001 From: c8ef Date: Wed, 19 Feb 2025 23:28:11 +0800 
Subject: [PATCH 066/220] [libc] add `LLVM_LIBC_CAST` macro. (#127319) related: #127238 This patch adds a macro called `LLVM_LIBC_CAST`, similar to `__BIONIC_CAST`, for type conversion in `endian.h`. --- libc/include/__llvm-libc-common.h | 8 +++++++ libc/include/llvm-libc-macros/endian-macros.h | 24 +++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index 212e3c6a9446c..c6fd33a55532c 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -47,6 +47,11 @@ #define __NOEXCEPT throw() #endif +// This macro serves as a generic cast implementation for use in both C and C++, +// similar to `__BIONIC_CAST` in Android. +#undef __LLVM_LIBC_CAST +#define __LLVM_LIBC_CAST(cast, type, value) (cast(value)) + #else // not __cplusplus #undef __BEGIN_C_DECLS @@ -85,6 +90,9 @@ #undef _Returns_twice #define _Returns_twice __attribute__((returns_twice)) +#undef __LLVM_LIBC_CAST +#define __LLVM_LIBC_CAST(cast, type, value) ((type)(value)) + #endif // __cplusplus #endif // _LLVM_LIBC_COMMON_H diff --git a/libc/include/llvm-libc-macros/endian-macros.h b/libc/include/llvm-libc-macros/endian-macros.h index e1e105d50c1c6..52d95dc01cd83 100644 --- a/libc/include/llvm-libc-macros/endian-macros.h +++ b/libc/include/llvm-libc-macros/endian-macros.h @@ -20,27 +20,27 @@ #define htobe16(x) __builtin_bswap16((x)) #define htobe32(x) __builtin_bswap32((x)) #define htobe64(x) __builtin_bswap64((x)) -#define htole16(x) ((uint16_t)(x)) -#define htole32(x) ((uint32_t)(x)) -#define htole64(x) ((uint64_t)(x)) +#define htole16(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define htole32(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define htole64(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define be16toh(x) __builtin_bswap16((x)) #define be32toh(x) __builtin_bswap32((x)) #define be64toh(x) __builtin_bswap64((x)) -#define le16toh(x) ((uint16_t)(x)) -#define le32toh(x) ((uint32_t)(x)) -#define le64toh(x) ((uint64_t)(x)) +#define le16toh(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define le32toh(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define le64toh(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #else -#define htobe16(x) ((uint16_t)(x)) -#define htobe32(x) ((uint32_t)(x)) -#define htobe64(x) ((uint64_t)(x)) +#define htobe16(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define htobe32(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define htobe64(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define htole16(x) __builtin_bswap16((x)) #define htole32(x) __builtin_bswap32((x)) #define htole64(x) __builtin_bswap64((x)) -#define be16toh(x) ((uint16_t)(x)) -#define be32toh(x) ((uint32_t)(x)) -#define be64toh(x) ((uint64_t)(x)) +#define be16toh(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define be32toh(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define be64toh(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define le16toh(x) __builtin_bswap16((x)) #define le32toh(x) __builtin_bswap32((x)) #define le64toh(x) __builtin_bswap64((x)) From 80ccf01c337f09146a2c502fe624f07c4b04b848 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 19 Feb 2025 16:47:18 +0100 Subject: [PATCH 067/220] [Clang] Do not try to transform invalid bindings (#125658) In the presence of an invalid structured binding decomposition, some binding packs may be invalid and trying to transform them would produce a recovery expression that does not contains a pack, leading to assertions in places where we would expect a 
pack at that stage. Fixes #125165 --- clang/lib/Sema/TreeTransform.h | 2 +- clang/test/SemaCXX/cxx2c-binding-pack.cpp | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 05cac8db3c42c..eaabfae2409f4 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -12716,7 +12716,7 @@ TreeTransform::TransformDeclRefExpr(DeclRefExpr *E) { ValueDecl *ND = cast_or_null(getDerived().TransformDecl(E->getLocation(), E->getDecl())); - if (!ND) + if (!ND || ND->isInvalidDecl()) return ExprError(); NamedDecl *Found = ND; diff --git a/clang/test/SemaCXX/cxx2c-binding-pack.cpp b/clang/test/SemaCXX/cxx2c-binding-pack.cpp index 62e1da565f2b5..0f10dad3937ba 100644 --- a/clang/test/SemaCXX/cxx2c-binding-pack.cpp +++ b/clang/test/SemaCXX/cxx2c-binding-pack.cpp @@ -218,3 +218,18 @@ auto X = [] () { static_assert(sizeof...(pack3) == 5); }; } // namespace + +namespace GH125165 { + +template +auto f(auto t) { + const auto& [...pack] = t; + // expected-error@-1 {{cannot decompose non-class, non-array type 'char const'}} + (pack, ...); +}; + +void g() { + f('x'); // expected-note {{in instantiation}} +} + +} From 888c09986ce0b1a02ba904b167a3650b1d7eee2d Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 19 Feb 2025 16:48:38 +0100 Subject: [PATCH 068/220] [Clang] Add release note for #127623 (#127815) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While reviewing #127623, I missed that it didn’t have a release note. --- clang/docs/ReleaseNotes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a91c764860ccd..5780f5d61d579 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -170,6 +170,7 @@ Bug Fixes to C++ Support - Clang is now better at keeping track of friend function template instance contexts. (#GH55509) - The initialization kind of elements of structured bindings direct-list-initialized from an array is corrected to direct-initialization. +- Clang no longer crashes when a coroutine is declared ``[[noreturn]]``. 
(#GH127327) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ From 3e61c1ab7f5d9666db88069d49c8916c40fae5ea Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 19 Feb 2025 16:52:01 +0100 Subject: [PATCH 069/220] [libc++] Avoid code duplication in strings operator+ overloads (#126048) --- libcxx/include/string | 190 +++++++++++------------------------------- 1 file changed, 49 insertions(+), 141 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 396e73522d3e7..3f43e8fd8d586 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -691,50 +691,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD // basic_string -template -basic_string<_CharT, _Traits, _Allocator> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -_LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const _CharT* __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(_CharT __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const _CharT* __y); - template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, _CharT __y); - -# if _LIBCPP_STD_VER >= 26 - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - type_identity_t> __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t> __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs); - -# endif - -extern template _LIBCPP_EXPORTED_FROM_ABI string operator+ - , allocator >(char const*, string const&); +__concatenate_strings(const _Allocator& __alloc, + __type_identity_t > __str1, + __type_identity_t > __str2); template struct __string_is_trivial_iterator : public false_type {}; @@ -2425,15 +2386,8 @@ private: std::__throw_out_of_range("basic_string"); } - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const value_type*, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(value_type, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const value_type*); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, value_type); -# if _LIBCPP_STD_VER >= 26 - friend constexpr basic_string operator+ <>(const basic_string&, type_identity_t<__self_view>); - friend constexpr basic_string operator+ <>(type_identity_t<__self_view>, const basic_string&); -# endif + friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string + 
__concatenate_strings<>(const _Allocator&, __type_identity_t<__self_view>, __type_identity_t<__self_view>); template friend inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool @@ -3815,83 +3769,73 @@ operator>=(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs) { +__concatenate_strings(const _Allocator& __alloc, + __type_identity_t > __str1, + __type_identity_t > __str2) { using _String = basic_string<_CharT, _Traits, _Allocator>; - auto __lhs_sz = __lhs.size(); - auto __rhs_sz = __rhs.size(); _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); + __str1.size() + __str2.size(), + _String::__alloc_traits::select_on_container_copy_construction(__alloc)); auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); + _Traits::copy(__ptr, __str1.data(), __str1.size()); + _Traits::copy(__ptr + __str1.size(), __str2.data(), __str2.size()); + _Traits::assign(__ptr[__str1.size() + __str2.size()], _CharT()); return __r; } +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); +} + template _LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - auto __lhs_sz = _Traits::length(__lhs); - auto __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs, __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>(__rhs.get_allocator(), __lhs, __rhs); } +extern template _LIBCPP_EXPORTED_FROM_ABI string operator+ + , allocator >(char const*, string const&); + template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(_CharT __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __rhs_sz + 1, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::assign(__ptr, 1, __lhs); - _Traits::copy(__ptr + 1, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + 1 + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>( + __rhs.get_allocator(), basic_string_view<_CharT, _Traits>(&__lhs, 1), __rhs); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, 
_Allocator> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = _Traits::length(__rhs); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs, __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, _CharT __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + 1, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::assign(__ptr + __lhs_sz, 1, __rhs); - _Traits::assign(__ptr + 1 + __lhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>( + __lhs.get_allocator(), __lhs, basic_string_view<_CharT, _Traits>(&__rhs, 1)); +} +# if _LIBCPP_STD_VER >= 26 + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + type_identity_t> __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); +} + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(type_identity_t> __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__rhs.get_allocator(), __lhs, __rhs); } +# endif // _LIBCPP_STD_VER >= 26 + # ifndef _LIBCPP_CXX03_LANG template @@ -3942,54 +3886,18 @@ operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, _CharT __rhs) { # if _LIBCPP_STD_VER >= 26 -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - type_identity_t> __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; -} - template _LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t> __rhs) { - __lhs.append(__rhs); - return std::move(__lhs); -} - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, - const 
basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::move(__lhs.append(__rhs)); } template _LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> operator+(type_identity_t> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs) { - __rhs.insert(0, __lhs); - return std::move(__rhs); + return std::move(__rhs.insert(0, __lhs)); } # endif // _LIBCPP_STD_VER >= 26 From 2f2295cfae971a9564a3ba25b5c2338cfb36f154 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 19 Feb 2025 08:19:49 -0800 Subject: [PATCH 070/220] [Analysis] Avoid repeated hash lookups (NFC) (#127743) --- llvm/lib/Analysis/AssumeBundleQueries.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp index 21530693c5f18..c27bfa6f3cc2c 100644 --- a/llvm/lib/Analysis/AssumeBundleQueries.cpp +++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp @@ -85,13 +85,14 @@ void llvm::fillMapFromAssume(AssumeInst &Assume, RetainedKnowledgeMap &Result) { if (!CI) continue; uint64_t Val = CI->getZExtValue(); - auto Lookup = Result.find(Key); - if (Lookup == Result.end() || !Lookup->second.count(&Assume)) { - Result[Key][&Assume] = {Val, Val}; + auto [It, Inserted] = Result[Key].try_emplace(&Assume); + if (Inserted) { + It->second = {Val, Val}; continue; } - Lookup->second[&Assume].Min = std::min(Val, Lookup->second[&Assume].Min); - Lookup->second[&Assume].Max = std::max(Val, Lookup->second[&Assume].Max); + auto &MinMax = It->second; + MinMax.Min = std::min(Val, MinMax.Min); + MinMax.Max = std::max(Val, MinMax.Max); } } From c23256ecbd29103c800d24f83649057ae84acb09 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 19 Feb 2025 08:20:21 -0800 Subject: [PATCH 071/220] [AsmPrinter] Avoid repeated hash lookups (NFC) (#127744) --- llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 1c603f5988ad1..e8d1aba63afb4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -252,10 +252,10 @@ void EHStreamer::computeCallSiteTable( if (&MBB == &Asm->MF->front() || MBB.isBeginSection()) { // We start a call-site range upon function entry and at the beginning of // every basic block section. 
- CallSiteRanges.push_back( - {Asm->MBBSectionRanges[MBB.getSectionID()].BeginLabel, - Asm->MBBSectionRanges[MBB.getSectionID()].EndLabel, - Asm->getMBBExceptionSym(MBB), CallSites.size()}); + auto &Range = Asm->MBBSectionRanges[MBB.getSectionID()]; + CallSiteRanges.push_back({Range.BeginLabel, Range.EndLabel, + Asm->getMBBExceptionSym(MBB), + CallSites.size()}); PreviousIsInvoke = false; SawPotentiallyThrowing = false; LastLabel = nullptr; From af922cf9f7e7f126f2efaf9660ceea8e5eba21b5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 19 Feb 2025 08:20:46 -0800 Subject: [PATCH 072/220] [CodeGen] Avoid repeated hash lookups (NFC) (#127745) --- llvm/lib/CodeGen/WinEHPrepare.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index b98523cac1f2f..1970716485613 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -251,15 +251,15 @@ void llvm::calculateCXXStateForAsynchEH(const BasicBlock *BB, int State, const BasicBlock *BB = WI->Block; int State = WI->State; delete WI; - if (auto It = EHInfo.BlockToStateMap.find(BB); - It != EHInfo.BlockToStateMap.end() && It->second <= State) + auto [StateIt, Inserted] = EHInfo.BlockToStateMap.try_emplace(BB); + if (!Inserted && StateIt->second <= State) continue; // skip blocks already visited by lower State BasicBlock::const_iterator It = BB->getFirstNonPHIIt(); const llvm::Instruction *TI = BB->getTerminator(); if (It->isEHPad()) State = EHInfo.EHPadStateMap[&*It]; - EHInfo.BlockToStateMap[BB] = State; // Record state, also flag visiting + StateIt->second = State; // Record state, also flag visiting if ((isa(TI) || isa(TI)) && State > 0) { // Retrive the new State From 1bb72f0d7dd623e1c75dbe9e6a7f6b41f5284474 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 19 Feb 2025 08:21:10 -0800 Subject: [PATCH 073/220] [Object] Avoid repeated hash lookups (NFC) (#127746) --- llvm/lib/Object/GOFFObjectFile.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Object/GOFFObjectFile.cpp b/llvm/lib/Object/GOFFObjectFile.cpp index 7806953aecd29..a55005e689e62 100644 --- a/llvm/lib/Object/GOFFObjectFile.cpp +++ b/llvm/lib/Object/GOFFObjectFile.cpp @@ -503,8 +503,9 @@ GOFFObjectFile::getSectionContents(DataRefImpl Sec) const { std::copy(CompleteData.data(), CompleteData.data() + TxtDataSize, Data.begin() + TxtDataOffset); } - SectionDataCache[Sec.d.a] = Data; - return ArrayRef(SectionDataCache[Sec.d.a]); + auto &Cache = SectionDataCache[Sec.d.a]; + Cache = Data; + return ArrayRef(Cache); } uint64_t GOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const { From bb75a96900ad52b01e51fc42c3533a6febf97e27 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 19 Feb 2025 08:21:33 -0800 Subject: [PATCH 074/220] [Support] Avoid repeated hash lookups (NFC) (#127747) --- llvm/include/llvm/Support/DebugCounter.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index e4345e5739e99..8e9dc29e4f48a 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -162,8 +162,9 @@ class DebugCounter { protected: unsigned addCounter(const std::string &Name, const std::string &Desc) { unsigned Result = RegisteredCounters.insert(Name); - Counters[Result] = {}; - Counters[Result].Desc = Desc; + auto &C = Counters[Result]; + C = {}; + C.Desc = Desc; return Result; } // Struct to 
store counter info. From fc5849de6abba74dd0bb9a062b207ba4fcd7a37d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 19 Feb 2025 08:21:53 -0800 Subject: [PATCH 075/220] [X86] Avoid repeated hash lookups (NFC) (#127748) --- llvm/lib/Target/X86/X86PadShortFunction.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp index c43fd97a055fc..2859195c6c26e 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -163,7 +163,8 @@ void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { return; if (hasReturn) { - ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles); + unsigned int &NumCycles = ReturnBBs[MBB]; + NumCycles = std::max(NumCycles, Cycles); return; } From 9743b99cd1d1775f9f367e5f1c6d40ba09ec523b Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 19 Feb 2025 16:26:24 +0000 Subject: [PATCH 076/220] [libclc] Explicitly qualify private address spaces (#127823) Doing so provides stability when compiling the builtins in a mode in which unqualified pointers may be interpreted as being in the generic address space, such as in OpenCL 3.0. We eventually want to provide 'generic' overloads of the builtins in libclc so this prepares the ground a little better. It could be argued that having the internal CLC helper functions be unqualified is more flexible, in case it's better for a target to have the pointers in the generic address space. This commits to the private address space for more stability across different OpenCL environments. --- libclc/generic/lib/math/ep_log.cl | 3 ++- libclc/generic/lib/math/ep_log.h | 3 ++- libclc/generic/lib/math/modf.inc | 18 +++++++++-------- libclc/generic/lib/math/sincos_helpers.cl | 24 +++++++++++++---------- libclc/generic/lib/math/sincos_helpers.h | 12 +++++++----- 5 files changed, 35 insertions(+), 25 deletions(-) diff --git a/libclc/generic/lib/math/ep_log.cl b/libclc/generic/lib/math/ep_log.cl index 65db94a85b9b4..f0b5d3fdfbb1c 100644 --- a/libclc/generic/lib/math/ep_log.cl +++ b/libclc/generic/lib/math/ep_log.cl @@ -38,7 +38,8 @@ #define LF1 1.24999999978138668903e-02 #define LF2 2.23219810758559851206e-03 -_CLC_DEF void __clc_ep_log(double x, int *xexp, double *r1, double *r2) { +_CLC_DEF void __clc_ep_log(double x, private int *xexp, private double *r1, + private double *r2) { // Computes natural log(x). Algorithm based on: // Ping-Tak Peter Tang // "Table-driven implementation of the logarithm function in IEEE diff --git a/libclc/generic/lib/math/ep_log.h b/libclc/generic/lib/math/ep_log.h index 414e6231f7fd6..3176cfe5b42ce 100644 --- a/libclc/generic/lib/math/ep_log.h +++ b/libclc/generic/lib/math/ep_log.h @@ -26,6 +26,7 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DECL void __clc_ep_log(double x, int *xexp, double *r1, double *r2); +_CLC_DECL void __clc_ep_log(double x, private int *xexp, private double *r1, + private double *r2); #endif diff --git a/libclc/generic/lib/math/modf.inc b/libclc/generic/lib/math/modf.inc index 1ffc6d9e851bd..ff7ef30dd42f8 100644 --- a/libclc/generic/lib/math/modf.inc +++ b/libclc/generic/lib/math/modf.inc @@ -28,18 +28,20 @@ #define ZERO 0.0h #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, __CLC_GENTYPE *iptr) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, + private __CLC_GENTYPE *iptr) { *iptr = trunc(x); return copysign(isinf(x) ? 
ZERO : x - *iptr, x); } -#define MODF_DEF(addrspace) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \ - __CLC_GENTYPE private_iptr; \ - __CLC_GENTYPE ret = modf(x, &private_iptr); \ - *iptr = private_iptr; \ - return ret; \ -} +#define MODF_DEF(addrspace) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, \ + addrspace __CLC_GENTYPE *iptr) { \ + __CLC_GENTYPE private_iptr; \ + __CLC_GENTYPE ret = modf(x, &private_iptr); \ + *iptr = private_iptr; \ + return ret; \ + } MODF_DEF(local); MODF_DEF(global); diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl index 22f2bf61bf27d..441bad2be432f 100644 --- a/libclc/generic/lib/math/sincos_helpers.cl +++ b/libclc/generic/lib/math/sincos_helpers.cl @@ -119,8 +119,8 @@ _CLC_DEF float __clc_tanf_piby4(float x, int regn) { return regn & 1 ? tr : t; } -_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, - float bt) { +_CLC_DEF void __clc_fullMulS(private float *hi, private float *lo, float a, + float b, float bh, float bt) { if (HAVE_HW_FMA32()) { float ph = a * b; *hi = ph; @@ -136,7 +136,7 @@ _CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, } } -_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { +_CLC_DEF float __clc_removePi2S(private float *hi, private float *lo, float x) { // 72 bits of pi/2 const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f; const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f; @@ -174,7 +174,8 @@ _CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { return fnpi2; } -_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionSmallS(private float *r, private float *rr, + float x) { float fnpi2 = __clc_removePi2S(r, rr, x); return (int)fnpi2 & 0x3; } @@ -188,7 +189,8 @@ _CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { HI = __clc_mul_hi(A, B); \ HI += LO < C -_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionLargeS(private float *r, private float *rr, + float x) { int xe = (int)(as_uint(x) >> 23) - 127; uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); @@ -330,7 +332,7 @@ _CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { return ((i >> 1) + (i & 1)) & 0x3; } -_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionS(private float *r, private float *rr, float x) { if (x < 0x1.0p+23f) return __clc_argReductionSmallS(r, rr, x); else @@ -342,8 +344,9 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { #pragma OPENCL EXTENSION cl_khr_fp64 : enable // Reduction for medium sized arguments -_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, - int *regn) { +_CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r, + private double *rr, + private int *regn) { // How many pi/2 is x a multiple of? const double two_by_pi = 0x1.45f306dc9c883p-1; double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5)); @@ -387,8 +390,9 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, // Return value "regn" tells how many lots of pi/2 were subtracted // from x to put it in the range [-pi/4,pi/4], mod 4. 
-_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, - int *regn) { +_CLC_DEF void __clc_remainder_piby2_large(double x, private double *r, + private double *rr, + private int *regn) { long ux = as_long(x); int e = (int)(ux >> 52) - 1023; diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h index 6dbca73aa2a2e..c7981e5278f2a 100644 --- a/libclc/generic/lib/math/sincos_helpers.h +++ b/libclc/generic/lib/math/sincos_helpers.h @@ -26,16 +26,18 @@ _CLC_DECL float __clc_sinf_piby4(float x, float y); _CLC_DECL float __clc_cosf_piby4(float x, float y); _CLC_DECL float __clc_tanf_piby4(float x, int y); -_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x); +_CLC_DECL int __clc_argReductionS(private float *r, private float *rr, float x); #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, - int *regn); -_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, - int *regn); +_CLC_DECL void __clc_remainder_piby2_medium(double x, private double *r, + private double *rr, + private int *regn); +_CLC_DECL void __clc_remainder_piby2_large(double x, private double *r, + private double *rr, + private int *regn); _CLC_DECL double2 __clc_sincos_piby4(double x, double xx); #endif From 65998ab2cb5069871799cd6d0977954f14cbb93e Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 19 Feb 2025 08:31:40 -0800 Subject: [PATCH 077/220] [lldb] Make GetOutputStreamSP and GetErrorStreamSP protected (#127682) This makes GetOutputStreamSP and GetErrorStreamSP protected members of Debugger. Users who want to print to the debugger's stream should use GetAsyncOutputStreamSP and GetAsyncErrorStreamSP instead and the few remaining stragglers have been migrated. 
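
As a rough illustration of the migration this implies for callers (a minimal
sketch, not part of the patch: the helper name, the count parameter, and the
printed text are made up, and the async accessor is spelled
GetAsyncOutputStream here because that is the spelling used in the hunks
below):

    // Before: reaching into the debugger's output stream directly. This
    // stops compiling for code outside Debugger/IOHandler once the
    // accessor becomes protected.
    //   lldb::StreamFileSP out_sp = debugger.GetOutputStreamSP();
    //   out_sp->Printf("loaded %u kexts\n", count);

    // After: ask for the asynchronous stream instead.
    void PrintLoadStatus(lldb_private::Debugger &debugger, unsigned count) {
      lldb::StreamSP out_sp = debugger.GetAsyncOutputStream();
      out_sp->Printf("loaded %u kexts\n", count);
    }

The error-stream side is analogous (GetErrorStreamSP -> GetAsyncErrorStream).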
--- lldb/include/lldb/Core/Debugger.h | 20 ++++++---- lldb/include/lldb/Target/ThreadPlanTracer.h | 2 +- lldb/source/API/SBDebugger.cpp | 12 +++--- lldb/source/Commands/CommandObjectGUI.cpp | 8 ++-- lldb/source/Commands/CommandObjectLog.cpp | 3 +- lldb/source/Core/Debugger.cpp | 8 ++-- .../source/Interpreter/CommandInterpreter.cpp | 4 +- lldb/source/Interpreter/ScriptInterpreter.cpp | 4 +- .../DynamicLoaderDarwinKernel.cpp | 9 +++-- .../UBSan/InstrumentationRuntimeUBSan.cpp | 2 - .../Utility/ReportRetriever.cpp | 4 +- .../Lua/ScriptInterpreterLua.cpp | 4 +- lldb/source/Target/ThreadPlanTracer.cpp | 39 ++++++++++--------- 13 files changed, 62 insertions(+), 57 deletions(-) diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index d7751ca045bb2..7f08f3dd26106 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -131,17 +131,13 @@ class Debugger : public std::enable_shared_from_this, void SetAsyncExecution(bool async); - lldb::FileSP GetInputFileSP() { return m_input_file_sp; } - - lldb::StreamFileSP GetOutputStreamSP() { return m_output_stream_sp; } - - lldb::StreamFileSP GetErrorStreamSP() { return m_error_stream_sp; } - File &GetInputFile() { return *m_input_file_sp; } - File &GetOutputFile() { return m_output_stream_sp->GetFile(); } + lldb::FileSP GetInputFileSP() { return m_input_file_sp; } + + lldb::FileSP GetOutputFileSP() { return m_output_stream_sp->GetFileSP(); } - File &GetErrorFile() { return m_error_stream_sp->GetFile(); } + lldb::FileSP GetErrorFileSP() { return m_error_stream_sp->GetFileSP(); } repro::DataRecorder *GetInputRecorder(); @@ -649,6 +645,14 @@ class Debugger : public std::enable_shared_from_this, void PrintProgress(const ProgressEventData &data); + /// Except for Debugger and IOHandler, GetOutputStreamSP and GetErrorStreamSP + /// should not be used directly. Use GetAsyncOutputStream and + /// GetAsyncErrorStream instead. 
+ /// @{ + lldb::StreamFileSP GetOutputStreamSP() { return m_output_stream_sp; } + lldb::StreamFileSP GetErrorStreamSP() { return m_error_stream_sp; } + /// @} + void PushIOHandler(const lldb::IOHandlerSP &reader_sp, bool cancel_top_handler = true); diff --git a/lldb/include/lldb/Target/ThreadPlanTracer.h b/lldb/include/lldb/Target/ThreadPlanTracer.h index a6fd2f031dc22..7c45e213f94f1 100644 --- a/lldb/include/lldb/Target/ThreadPlanTracer.h +++ b/lldb/include/lldb/Target/ThreadPlanTracer.h @@ -56,7 +56,7 @@ class ThreadPlanTracer { Process &m_process; lldb::tid_t m_tid; - Stream *GetLogStream(); + lldb::StreamSP GetLogStreamSP(); virtual void Log(); diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index bf19d2ff8333c..e646b09e05852 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -509,14 +509,14 @@ SBFile SBDebugger::GetInputFile() { FILE *SBDebugger::GetOutputFileHandle() { LLDB_INSTRUMENT_VA(this); if (m_opaque_sp) - return m_opaque_sp->GetOutputStreamSP()->GetFile().GetStream(); + return m_opaque_sp->GetOutputFileSP()->GetStream(); return nullptr; } SBFile SBDebugger::GetOutputFile() { LLDB_INSTRUMENT_VA(this); if (m_opaque_sp) - return SBFile(m_opaque_sp->GetOutputStreamSP()->GetFileSP()); + return SBFile(m_opaque_sp->GetOutputFileSP()); return SBFile(); } @@ -524,7 +524,7 @@ FILE *SBDebugger::GetErrorFileHandle() { LLDB_INSTRUMENT_VA(this); if (m_opaque_sp) - return m_opaque_sp->GetErrorStreamSP()->GetFile().GetStream(); + return m_opaque_sp->GetErrorFileSP()->GetStream(); return nullptr; } @@ -532,7 +532,7 @@ SBFile SBDebugger::GetErrorFile() { LLDB_INSTRUMENT_VA(this); SBFile file; if (m_opaque_sp) - return SBFile(m_opaque_sp->GetErrorStreamSP()->GetFileSP()); + return SBFile(m_opaque_sp->GetErrorFileSP()); return SBFile(); } @@ -573,8 +573,8 @@ void SBDebugger::HandleCommand(const char *command) { sb_interpreter.HandleCommand(command, result, false); - result.PutError(m_opaque_sp->GetErrorStreamSP()->GetFileSP()); - result.PutOutput(m_opaque_sp->GetOutputStreamSP()->GetFileSP()); + result.PutError(m_opaque_sp->GetErrorFileSP()); + result.PutOutput(m_opaque_sp->GetOutputFileSP()); if (!m_opaque_sp->GetAsyncExecution()) { SBProcess process(GetCommandInterpreter().GetProcess()); diff --git a/lldb/source/Commands/CommandObjectGUI.cpp b/lldb/source/Commands/CommandObjectGUI.cpp index b56e49b073b03..8630171bae9d1 100644 --- a/lldb/source/Commands/CommandObjectGUI.cpp +++ b/lldb/source/Commands/CommandObjectGUI.cpp @@ -28,10 +28,10 @@ void CommandObjectGUI::DoExecute(Args &args, CommandReturnObject &result) { #if LLDB_ENABLE_CURSES Debugger &debugger = GetDebugger(); - File &input = debugger.GetInputFile(); - File &output = debugger.GetOutputFile(); - if (input.GetStream() && output.GetStream() && input.GetIsRealTerminal() && - input.GetIsInteractive()) { + FileSP input_sp = debugger.GetInputFileSP(); + FileSP output_sp = debugger.GetOutputFileSP(); + if (input_sp->GetStream() && output_sp->GetStream() && + input_sp->GetIsRealTerminal() && input_sp->GetIsInteractive()) { IOHandlerSP io_handler_sp(new IOHandlerCursesGUI(debugger)); if (io_handler_sp) debugger.RunIOHandlerAsync(io_handler_sp); diff --git a/lldb/source/Commands/CommandObjectLog.cpp b/lldb/source/Commands/CommandObjectLog.cpp index 5fb2dfaab8de0..17efae189b05e 100644 --- a/lldb/source/Commands/CommandObjectLog.cpp +++ b/lldb/source/Commands/CommandObjectLog.cpp @@ -394,7 +394,8 @@ class CommandObjectLogDump : public CommandObjectParsed { (*file)->GetDescriptor(), 
/*shouldClose=*/true); } else { stream_up = std::make_unique( - GetDebugger().GetOutputFile().GetDescriptor(), /*shouldClose=*/false); + GetDebugger().GetOutputFileSP()->GetDescriptor(), + /*shouldClose=*/false); } const std::string channel = std::string(args[0].ref()); diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 18569e155b517..18cdec4e0af73 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -947,7 +947,7 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) if (term && !strcmp(term, "dumb")) SetUseColor(false); // Turn off use-color if we don't write to a terminal with color support. - if (!GetOutputFile().GetIsTerminalWithColors()) + if (!GetOutputFileSP()->GetIsTerminalWithColors()) SetUseColor(false); if (Diagnostics::Enabled()) { @@ -1678,7 +1678,7 @@ bool Debugger::EnableLog(llvm::StringRef channel, LLDB_LOG_OPTION_PREPEND_TIMESTAMP | LLDB_LOG_OPTION_PREPEND_THREAD_NAME; } else if (log_file.empty()) { log_handler_sp = - CreateLogHandler(log_handler_kind, GetOutputFile().GetDescriptor(), + CreateLogHandler(log_handler_kind, GetOutputFileSP()->GetDescriptor(), /*should_close=*/false, buffer_size); } else { auto pos = m_stream_handlers.find(log_file); @@ -2111,8 +2111,8 @@ void Debugger::HandleProgressEvent(const lldb::EventSP &event_sp) { // Determine whether the current output file is an interactive terminal with // color support. We assume that if we support ANSI escape codes we support // vt100 escape codes. - File &file = GetOutputFile(); - if (!file.GetIsInteractive() || !file.GetIsTerminalWithColors()) + FileSP file_sp = GetOutputFileSP(); + if (!file_sp->GetIsInteractive() || !file_sp->GetIsTerminalWithColors()) return; StreamSP output = GetAsyncOutputStream(); diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index acdec84a1689b..5346d5a2d162a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2837,8 +2837,8 @@ void CommandInterpreter::HandleCommandsFromFile( } if (flags & eHandleCommandFlagPrintResult) { - debugger.GetOutputFile().Printf("Executing commands in '%s'.\n", - cmd_file_path.c_str()); + debugger.GetOutputFileSP()->Printf("Executing commands in '%s'.\n", + cmd_file_path.c_str()); } // Used for inheriting the right settings when "command source" might diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 8d10e5de01225..a392d5777a021 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -245,8 +245,8 @@ ScriptInterpreterIORedirect::ScriptInterpreterIORedirect( if (outfile_handle) ::setbuf(outfile_handle, nullptr); - result->SetImmediateOutputFile(debugger.GetOutputStreamSP()->GetFileSP()); - result->SetImmediateErrorFile(debugger.GetErrorStreamSP()->GetFileSP()); + result->SetImmediateOutputFile(debugger.GetOutputFileSP()); + result->SetImmediateErrorFile(debugger.GetErrorFileSP()); } } diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index cff44b588e26e..1d4cda6c046b7 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -1193,7 +1193,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { 
m_kext_summary_header.version = data.GetU32(&offset); if (m_kext_summary_header.version > 128) { lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable version number %u\n", m_kext_summary_header.version); @@ -1208,7 +1208,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { // If we get an improbably large entry_size, we're probably // getting bad memory. lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable entry_size %u\n", m_kext_summary_header.entry_size); @@ -1226,7 +1226,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { // If we get an improbably large number of kexts, we're probably // getting bad memory. lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable number of kexts %u\n", m_kext_summary_header.entry_count); @@ -1330,7 +1330,8 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( number_of_old_kexts_being_removed == 0) return true; - lldb::StreamSP s = m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + lldb::StreamSP s = + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); if (load_kexts) { if (number_of_new_kexts_being_added > 0 && number_of_old_kexts_being_removed > 0) { diff --git a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp index 8c2700cf21de9..c2db3540a797b 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp @@ -116,8 +116,6 @@ StructuredData::ObjectSP InstrumentationRuntimeUBSan::RetrieveReportData( if (!frame_sp) return StructuredData::ObjectSP(); - StreamFileSP Stream = target.GetDebugger().GetOutputStreamSP(); - EvaluateExpressionOptions options; options.SetUnwindOnError(true); options.SetTryAllThreads(true); diff --git a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp index 74e0fa7d49f82..d61c59776eee6 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp @@ -210,8 +210,8 @@ bool ReportRetriever::NotifyBreakpointHit(ProcessSP process_sp, InstrumentationRuntimeStopInfo::CreateStopReasonWithInstrumentationData( *thread_sp, description, report)); - if (StreamFileSP stream_sp = StreamFileSP( - process_sp->GetTarget().GetDebugger().GetOutputStreamSP())) + if (StreamSP stream_sp = + process_sp->GetTarget().GetDebugger().GetAsyncOutputStream()) stream_sp->Printf("AddressSanitizer report breakpoint hit. 
Use 'thread " "info -s' to get extended information about the " "report.\n"); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index 7e8eee9f5aa4f..6d028e324ee4e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -45,8 +45,8 @@ class IOHandlerLuaInterpreter : public IOHandlerDelegate, m_script_interpreter(script_interpreter), m_active_io_handler(active_io_handler) { llvm::cantFail(m_script_interpreter.GetLua().ChangeIO( - debugger.GetOutputFile().GetStream(), - debugger.GetErrorFile().GetStream())); + debugger.GetOutputFileSP()->GetStream(), + debugger.GetErrorFileSP()->GetStream())); llvm::cantFail(m_script_interpreter.EnterSession(debugger.GetID())); } diff --git a/lldb/source/Target/ThreadPlanTracer.cpp b/lldb/source/Target/ThreadPlanTracer.cpp index a119bf8589279..ab63cc7f6c223 100644 --- a/lldb/source/Target/ThreadPlanTracer.cpp +++ b/lldb/source/Target/ThreadPlanTracer.cpp @@ -27,6 +27,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/State.h" +#include "lldb/lldb-forward.h" using namespace lldb; using namespace lldb_private; @@ -41,13 +42,13 @@ ThreadPlanTracer::ThreadPlanTracer(Thread &thread) : m_process(*thread.GetProcess().get()), m_tid(thread.GetID()), m_enabled(false), m_stream_sp(), m_thread(nullptr) {} -Stream *ThreadPlanTracer::GetLogStream() { +StreamSP ThreadPlanTracer::GetLogStreamSP() { if (m_stream_sp) - return m_stream_sp.get(); + return m_stream_sp; else { TargetSP target_sp(GetThread().CalculateTarget()); if (target_sp) - return target_sp->GetDebugger().GetOutputStreamSP().get(); + return target_sp->GetDebugger().GetAsyncOutputStream(); } return nullptr; } @@ -65,12 +66,11 @@ void ThreadPlanTracer::Log() { bool show_frame_index = false; bool show_fullpaths = false; - Stream *stream = GetLogStream(); - if (stream) { - GetThread().GetStackFrameAtIndex(0)->Dump(stream, show_frame_index, + if (StreamSP stream_sp = GetLogStreamSP()) { + GetThread().GetStackFrameAtIndex(0)->Dump(stream_sp.get(), show_frame_index, show_fullpaths); - stream->Printf("\n"); - stream->Flush(); + stream_sp->Printf("\n"); + stream_sp->Flush(); } } @@ -129,9 +129,9 @@ void ThreadPlanAssemblyTracer::TracingStarted() { void ThreadPlanAssemblyTracer::TracingEnded() { m_register_values.clear(); } void ThreadPlanAssemblyTracer::Log() { - Stream *stream = GetLogStream(); + StreamSP stream_sp = GetLogStreamSP(); - if (!stream) + if (!stream_sp) return; RegisterContext *reg_ctx = GetThread().GetRegisterContext().get(); @@ -142,9 +142,10 @@ void ThreadPlanAssemblyTracer::Log() { uint8_t buffer[16] = {0}; // Must be big enough for any single instruction addr_valid = m_process.GetTarget().ResolveLoadAddress(pc, pc_addr); - pc_addr.Dump(stream, &GetThread(), Address::DumpStyleResolvedDescription, + pc_addr.Dump(stream_sp.get(), &GetThread(), + Address::DumpStyleResolvedDescription, Address::DumpStyleModuleWithFileAddress); - stream->PutCString(" "); + stream_sp->PutCString(" "); Disassembler *disassembler = GetDisassembler(); if (disassembler) { @@ -175,7 +176,7 @@ void ThreadPlanAssemblyTracer::Log() { instruction_list.GetInstructionAtIndex(0).get(); const FormatEntity::Entry *disassemble_format = m_process.GetTarget().GetDebugger().GetDisassemblyFormat(); - instruction->Dump(stream, max_opcode_byte_size, show_address, + instruction->Dump(stream_sp.get(), 
max_opcode_byte_size, show_address, show_bytes, show_control_flow_kind, nullptr, nullptr, nullptr, disassemble_format, 0); } @@ -198,12 +199,12 @@ void ThreadPlanAssemblyTracer::Log() { if (abi->GetArgumentValues(GetThread(), value_list)) { for (int arg_index = 0; arg_index < num_args; ++arg_index) { - stream->Printf( + stream_sp->Printf( "\n\targ[%d]=%llx", arg_index, value_list.GetValueAtIndex(arg_index)->GetScalar().ULongLong()); if (arg_index + 1 < num_args) - stream->PutCString(", "); + stream_sp->PutCString(", "); } } } @@ -222,14 +223,14 @@ void ThreadPlanAssemblyTracer::Log() { if (m_register_values[reg_num].GetType() == RegisterValue::eTypeInvalid || reg_value != m_register_values[reg_num]) { if (reg_value.GetType() != RegisterValue::eTypeInvalid) { - stream->PutCString("\n\t"); - DumpRegisterValue(reg_value, *stream, *reg_info, true, false, + stream_sp->PutCString("\n\t"); + DumpRegisterValue(reg_value, *stream_sp, *reg_info, true, false, eFormatDefault); } } m_register_values[reg_num] = reg_value; } } - stream->EOL(); - stream->Flush(); + stream_sp->EOL(); + stream_sp->Flush(); } From e2ba1b6ffde4ec607342b1b746d1b57f0f04390a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 19 Feb 2025 11:28:52 -0500 Subject: [PATCH 078/220] Revert "Reapply [CaptureTracking][FunctionAttrs] Add support for CaptureInfo (#125880)" This reverts commit 0fab404ee874bc5b0c442d1841c7d2005c3f8729. Seems to break LTO builds of clang on Windows, see comments on https://github.com/llvm/llvm-project/pull/125880 --- clang/test/CodeGen/allow-ubsan-check.c | 6 +- .../RelativeVTablesABI/dynamic-cast.cpp | 6 +- .../RelativeVTablesABI/type-info.cpp | 2 +- .../CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl | 4 +- clang/test/CodeGenOpenCL/as_type.cl | 2 +- llvm/include/llvm/Analysis/CaptureTracking.h | 68 ++----- llvm/include/llvm/IR/InstrTypes.h | 5 - llvm/include/llvm/Support/ModRef.h | 13 -- llvm/lib/Analysis/AliasAnalysis.cpp | 12 +- llvm/lib/Analysis/CaptureTracking.cpp | 127 +++++------- llvm/lib/Analysis/InstructionSimplify.cpp | 7 +- llvm/lib/IR/Instructions.cpp | 14 -- .../Transforms/IPO/AttributorAttributes.cpp | 38 ++-- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 149 +++++--------- .../InstCombine/InstCombineCompares.cpp | 7 +- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 45 +++-- .../FunctionAttrs/2009-01-02-LocalStores.ll | 2 +- .../Transforms/FunctionAttrs/arg_returned.ll | 24 +-- .../Transforms/FunctionAttrs/nocapture.ll | 187 ++---------------- llvm/test/Transforms/FunctionAttrs/nonnull.ll | 28 +-- llvm/test/Transforms/FunctionAttrs/noundef.ll | 8 +- .../Transforms/FunctionAttrs/readattrs.ll | 8 +- llvm/test/Transforms/FunctionAttrs/stats.ll | 4 +- .../AArch64/block_scaling_decompr_8bit.ll | 2 +- .../PhaseOrdering/bitcast-store-branch.ll | 2 +- .../dce-after-argument-promotion-loads.ll | 2 +- .../enable-loop-header-duplication-oz.ll | 4 +- .../Analysis/CaptureTrackingTest.cpp | 4 +- 28 files changed, 241 insertions(+), 539 deletions(-) diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index c116604288546..0cd81a77f5cc5 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -86,7 +86,7 @@ int div(int x, int y) { } // CHECK-LABEL: define dso_local i32 @null( -// CHECK-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr 
[[X]], null, !nosanitize [[META2]] // @@ -102,7 +102,7 @@ int div(int x, int y) { // CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( -// TR-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// TR-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // TR-NEXT: [[ENTRY:.*:]] // TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] @@ -116,7 +116,7 @@ int div(int x, int y) { // TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 @null( -// REC-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// REC-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // REC-NEXT: [[ENTRY:.*:]] // REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp index 3662a270713b6..83daf57be22ff 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm | FileCheck %s -// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned captures(ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } @@ -22,12 +22,12 @@ // CHECK: declare ptr @__dynamic_cast(ptr, ptr, ptr, i64) local_unnamed_addr -// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned captures(ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned %b) local_unnamed_addr // CHECK-NEXT: entry // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } -// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly captures(address_is_null, ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %b, null // CHECK-NEXT: br i1 [[isnull]], label %[[dynamic_cast_end:[a-z0-9._]+]], label %[[dynamic_cast_notnull:[a-z0-9._]+]] diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp index 2a838708ca231..c471e5dbd7b33 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -24,7 +24,7 @@ // CHECK-NEXT: ret ptr @_ZTS1A // CHECK-NEXT: } -// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly captures(address_is_null) %a) local_unnamed_addr +// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly %a) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %a, null // CHECK-NEXT: br i1 [[isnull]], label %[[bad_typeid:[a-z0-9._]+]], label %[[end:[a-z0-9.+]+]] diff --git a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl index 62fd20c4d1414..0aadaad2dca5c 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl @@ -22,7 
+22,7 @@ __amdgpu_buffer_rsrc_t getBuffer(void *p) { } // CHECK-LABEL: define {{[^@]+}}@consumeBufferPtr -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(5) [[P]], addrspacecast (ptr null to ptr addrspace(5)) // CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] @@ -39,7 +39,7 @@ void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) { } // CHECK-LABEL: define {{[^@]+}}@test -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 diff --git a/clang/test/CodeGenOpenCL/as_type.cl b/clang/test/CodeGenOpenCL/as_type.cl index 2c6cdc3810b4d..1fe26fbeafdb4 100644 --- a/clang/test/CodeGenOpenCL/as_type.cl +++ b/clang/test/CodeGenOpenCL/as_type.cl @@ -67,7 +67,7 @@ int3 f8(char16 x) { return __builtin_astype(x, int3); } -//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone captures(ret: address, provenance) %[[x:.*]]) +//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone %[[x:.*]]) //CHECK: %[[cast:.*]] ={{.*}} addrspacecast ptr %[[x]] to ptr addrspace(1) //CHECK: ret ptr addrspace(1) %[[cast]] global int* addr_cast(int *x) { diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h index 573df8833bd46..06a00d9ae7899 100644 --- a/llvm/include/llvm/Analysis/CaptureTracking.h +++ b/llvm/include/llvm/Analysis/CaptureTracking.h @@ -14,13 +14,11 @@ #define LLVM_ANALYSIS_CAPTURETRACKING_H #include "llvm/ADT/DenseMap.h" -#include "llvm/Support/ModRef.h" namespace llvm { class Value; class Use; - class CaptureInfo; class DataLayout; class Instruction; class DominatorTree; @@ -79,47 +77,10 @@ namespace llvm { const DominatorTree &DT, unsigned MaxUsesToExplore = 0); - /// Capture information for a specific Use. - struct UseCaptureInfo { - /// Components captured by this use. - CaptureComponents UseCC; - /// Components captured by the return value of the user of this Use. - CaptureComponents ResultCC; - - UseCaptureInfo(CaptureComponents UseCC, - CaptureComponents ResultCC = CaptureComponents::None) - : UseCC(UseCC), ResultCC(ResultCC) {} - - static UseCaptureInfo passthrough() { - return UseCaptureInfo(CaptureComponents::None, CaptureComponents::All); - } - - bool isPassthrough() const { - return capturesNothing(UseCC) && capturesAnything(ResultCC); - } - - operator CaptureComponents() const { return UseCC | ResultCC; } - }; - /// This callback is used in conjunction with PointerMayBeCaptured. In /// addition to the interface here, you'll need to provide your own getters /// to see whether anything was captured. struct CaptureTracker { - /// Action returned from captures(). - enum Action { - /// Stop the traversal. - Stop, - /// Continue traversal, and also follow the return value of the user if - /// it has additional capture components (that is, if it has capture - /// components in Ret that are not part of Other). 
- Continue, - /// Continue traversal, but do not follow the return value of the user, - /// even if it has additional capture components. Should only be used if - /// captures() has already taken the potential return captures into - /// account. - ContinueIgnoringReturn, - }; - virtual ~CaptureTracker(); /// tooManyUses - The depth of traversal has breached a limit. There may be @@ -133,12 +94,10 @@ namespace llvm { /// U->getUser() is always an Instruction. virtual bool shouldExplore(const Use *U); - /// Use U directly captures CI.UseCC and additionally CI.ResultCC - /// through the return value of the user of U. - /// - /// Return one of Stop, Continue or ContinueIgnoringReturn to control - /// further traversal. - virtual Action captured(const Use *U, UseCaptureInfo CI) = 0; + /// captured - Information about the pointer was captured by the user of + /// use U. Return true to stop the traversal or false to continue looking + /// for more capturing instructions. + virtual bool captured(const Use *U) = 0; /// isDereferenceableOrNull - Overload to allow clients with additional /// knowledge about pointer dereferenceability to provide it and thereby @@ -146,18 +105,21 @@ namespace llvm { virtual bool isDereferenceableOrNull(Value *O, const DataLayout &DL); }; + /// Types of use capture kinds, see \p DetermineUseCaptureKind. + enum class UseCaptureKind { + NO_CAPTURE, + MAY_CAPTURE, + PASSTHROUGH, + }; + /// Determine what kind of capture behaviour \p U may exhibit. /// - /// The returned UseCaptureInfo contains the components captured directly - /// by the use (UseCC) and the components captured through the return value - /// of the user (ResultCC). - /// - /// \p Base is the starting value of the capture analysis, which is - /// relevant for address_is_null captures. + /// A use can be no-capture, a use can potentially capture, or a use can be + /// passthrough such that the uses of the user or \p U should be inspected. /// The \p IsDereferenceableOrNull callback is used to rule out capturing for /// certain comparisons. - UseCaptureInfo - DetermineUseCaptureKind(const Use &U, const Value *Base, + UseCaptureKind + DetermineUseCaptureKind(const Use &U, llvm::function_ref IsDereferenceableOrNull); diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 8e47e3c7b3a7c..90fe864d4ae71 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1692,11 +1692,6 @@ class CallBase : public Instruction { return capturesNothing(getCaptureInfo(OpNo)); } - /// Returns whether the call has an argument that has an attribute like - /// captures(ret: address, provenance), where the return capture components - /// are not a subset of the other capture components. - bool hasArgumentWithAdditionalReturnCaptureComponents() const; - /// Determine whether this argument is passed by value. 
bool isByValArgument(unsigned ArgNo) const { return paramHasAttr(ArgNo, Attribute::ByVal); diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 7f58f5236aedd..eb660844b0b3a 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -326,10 +326,6 @@ inline bool capturesFullProvenance(CaptureComponents CC) { return (CC & CaptureComponents::Provenance) == CaptureComponents::Provenance; } -inline bool capturesAll(CaptureComponents CC) { - return CC == CaptureComponents::All; -} - raw_ostream &operator<<(raw_ostream &OS, CaptureComponents CC); /// Represents which components of the pointer may be captured in which @@ -354,15 +350,6 @@ class CaptureInfo { /// Create CaptureInfo that may capture all components of the pointer. static CaptureInfo all() { return CaptureInfo(CaptureComponents::All); } - /// Create CaptureInfo that may only capture via the return value. - static CaptureInfo - retOnly(CaptureComponents RetComponents = CaptureComponents::All) { - return CaptureInfo(CaptureComponents::None, RetComponents); - } - - /// Whether the pointer is only captured via the return value. - bool isRetOnly() const { return capturesNothing(OtherComponents); } - /// Get components potentially captured by the return value. CaptureComponents getRetComponents() const { return RetComponents; } diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index aa72fb844ef32..1a9136e464d25 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -835,15 +835,9 @@ bool llvm::isBaseOfObject(const Value *V) { } bool llvm::isEscapeSource(const Value *V) { - if (auto *CB = dyn_cast(V)) { - if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, true)) - return false; - - // The return value of a function with a captures(ret: address, provenance) - // attribute is not necessarily an escape source. The return value may - // alias with a non-escaping object. - return !CB->hasArgumentWithAdditionalReturnCaptureComponents(); - } + if (auto *CB = dyn_cast(V)) + return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, + true); // The load case works because isNonEscapingLocalObject considers all // stores to be escapes (it passes true for the StoreCaptures argument diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 5120b910e7896..49baf2eb84bb3 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -81,15 +81,14 @@ struct SimpleCaptureTracker : public CaptureTracker { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { if (isa(U->getUser()) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; LLVM_DEBUG(dbgs() << "Captured by: " << *U->getUser() << "\n"); Captured = true; - return Stop; + return true; } bool ReturnCaptures; @@ -123,21 +122,19 @@ struct CapturesBefore : public CaptureTracker { return !isPotentiallyReachable(I, BeforeHere, nullptr, DT, LI); } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; // Check isSafeToPrune() here rather than in shouldExplore() to avoid // an expensive reachability query for every instruction we look at. 
// Instead we only do one for actual capturing candidates. if (isSafeToPrune(I)) - // If the use is not reachable, the instruction result isn't either. - return ContinueIgnoringReturn; + return false; Captured = true; - return Stop; + return true; } const Instruction *BeforeHere; @@ -169,11 +166,10 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = &*F.getEntryBlock().begin(); } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; if (!EarliestCapture) EarliestCapture = I; @@ -181,10 +177,9 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = DT.findNearestCommonDominator(EarliestCapture, I); Captured = true; - // Continue analysis, as we need to see all potential captures. However, - // we do not need to follow the instruction result, as this use will - // dominate any captures made through the instruction result.. - return ContinueIgnoringReturn; + // Return false to continue analysis; we need to see all potential + // captures. + return false; } Instruction *EarliestCapture = nullptr; @@ -279,26 +274,25 @@ Instruction *llvm::FindEarliestCapture(const Value *V, Function &F, return CB.EarliestCapture; } -UseCaptureInfo llvm::DetermineUseCaptureKind( - const Use &U, const Value *Base, +UseCaptureKind llvm::DetermineUseCaptureKind( + const Use &U, function_ref IsDereferenceableOrNull) { Instruction *I = dyn_cast(U.getUser()); // TODO: Investigate non-instruction uses. if (!I) - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: { - // TODO(captures): Make this more precise. auto *Call = cast(I); // Not captured if the callee is readonly, doesn't return a copy through // its return value and doesn't unwind (a readonly function can leak bits // by throwing an exception or not depending on the input value). if (Call->onlyReadsMemory() && Call->doesNotThrow() && Call->getType()->isVoidTy()) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; // The pointer is not captured if returned pointer is not captured. // NOTE: CaptureTracking users should not assume that only functions @@ -306,13 +300,13 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // getUnderlyingObject in ValueTracking or DecomposeGEPExpression // in BasicAA also need to know about this property. if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, true)) - return UseCaptureInfo::passthrough(); + return UseCaptureKind::PASSTHROUGH; // Volatile operations effectively capture the memory location that they // load and store to. if (auto *MI = dyn_cast(Call)) if (MI->isVolatile()) - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; // Calling a function pointer does not in itself cause the pointer to // be captured. This is a subtle point considering that (for example) @@ -321,27 +315,30 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). if (Call->isCallee(&U)) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; // Not captured if only passed via 'nocapture' arguments. 
assert(Call->isDataOperand(&U) && "Non-callee must be data operand");
- CaptureInfo CI = Call->getCaptureInfo(Call->getDataOperandNo(&U));
- return UseCaptureInfo(CI.getOtherComponents(), CI.getRetComponents());
+ if (!Call->doesNotCapture(Call->getDataOperandNo(&U))) {
+ // The parameter is not marked 'nocapture' - captured.
+ return UseCaptureKind::MAY_CAPTURE;
+ }
+ return UseCaptureKind::NO_CAPTURE;
}
case Instruction::Load:
// Volatile loads make the address observable.
if (cast<LoadInst>(I)->isVolatile())
- return CaptureComponents::All;
- return CaptureComponents::None;
+ return UseCaptureKind::MAY_CAPTURE;
+ return UseCaptureKind::NO_CAPTURE;
case Instruction::VAArg:
// "va-arg" from a pointer does not cause it to be captured.
- return CaptureComponents::None;
+ return UseCaptureKind::NO_CAPTURE;
case Instruction::Store:
// Stored the pointer - conservatively assume it may be captured.
// Volatile stores make the address observable.
if (U.getOperandNo() == 0 || cast<StoreInst>(I)->isVolatile())
- return CaptureComponents::All;
- return CaptureComponents::None;
+ return UseCaptureKind::MAY_CAPTURE;
+ return UseCaptureKind::NO_CAPTURE;
case Instruction::AtomicRMW: {
// atomicrmw conceptually includes both a load and store from
// the same location.
// Volatile stores make the address observable.
auto *ARMWI = cast<AtomicRMWInst>(I);
if (U.getOperandNo() == 1 || ARMWI->isVolatile())
- return CaptureComponents::All;
- return CaptureComponents::None;
+ return UseCaptureKind::MAY_CAPTURE;
+ return UseCaptureKind::NO_CAPTURE;
}
case Instruction::AtomicCmpXchg: {
// cmpxchg conceptually includes both a load and store from
// the same location.
// Volatile stores make the address observable.
auto *ACXI = cast<AtomicCmpXchgInst>(I);
if (U.getOperandNo() == 1 || U.getOperandNo() == 2 || ACXI->isVolatile())
- return CaptureComponents::All;
- return CaptureComponents::None;
+ return UseCaptureKind::MAY_CAPTURE;
+ return UseCaptureKind::NO_CAPTURE;
}
case Instruction::GetElementPtr:
// AA does not support pointers of vectors, so GEP vector splats need to
// be considered as captures.
if (I->getType()->isVectorTy())
- return CaptureComponents::All;
- return UseCaptureInfo::passthrough();
+ return UseCaptureKind::MAY_CAPTURE;
+ return UseCaptureKind::PASSTHROUGH;
case Instruction::BitCast:
case Instruction::PHI:
case Instruction::Select:
case Instruction::AddrSpaceCast:
// The original value is not captured via this if the new value isn't.
- return UseCaptureInfo::passthrough();
+ return UseCaptureKind::PASSTHROUGH;
case Instruction::ICmp: {
unsigned Idx = U.getOperandNo();
unsigned OtherIdx = 1 - Idx;
- if (isa<ConstantPointerNull>(I->getOperand(OtherIdx)) &&
- cast<ICmpInst>(I)->isEquality()) {
- // TODO(captures): Remove these special cases once we make use of
- // captures(address_is_null).
-
+ if (auto *CPN = dyn_cast<ConstantPointerNull>(I->getOperand(OtherIdx))) {
// Don't count comparisons of a no-alias return value against null as
// captures. This allows us to ignore comparisons of malloc results
// with null, for example.
- if (U->getType()->getPointerAddressSpace() == 0) + if (CPN->getType()->getAddressSpace() == 0) if (isNoAliasCall(U.get()->stripPointerCasts())) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; if (!I->getFunction()->nullPointerIsDefined()) { auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); // Comparing a dereferenceable_or_null pointer against null cannot @@ -397,23 +390,17 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // valid (in-bounds) pointer. const DataLayout &DL = I->getDataLayout(); if (IsDereferenceableOrNull && IsDereferenceableOrNull(O, DL)) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; } - - // Check whether this is a comparison of the base pointer against - // null. - if (U.get() == Base) - return CaptureComponents::AddressIsNull; } // Otherwise, be conservative. There are crazy ways to capture pointers - // using comparisons. However, only the address is captured, not the - // provenance. - return CaptureComponents::Address; + // using comparisons. + return UseCaptureKind::MAY_CAPTURE; } default: // Something else - be conservative and say it is captured. - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; } } @@ -451,26 +438,18 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, }; while (!Worklist.empty()) { const Use *U = Worklist.pop_back_val(); - UseCaptureInfo CI = DetermineUseCaptureKind(*U, V, IsDereferenceableOrNull); - if (capturesAnything(CI.UseCC)) { - switch (Tracker->captured(U, CI)) { - case CaptureTracker::Stop: + switch (DetermineUseCaptureKind(*U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: + continue; + case UseCaptureKind::MAY_CAPTURE: + if (Tracker->captured(U)) return; - case CaptureTracker::ContinueIgnoringReturn: - continue; - case CaptureTracker::Continue: - // Fall through to passthrough handling, but only if ResultCC contains - // additional components that UseCC does not. We assume that a - // capture at this point will be strictly more constraining than a - // later capture from following the return value. - if (capturesNothing(CI.ResultCC & ~CI.UseCC)) - continue; - break; - } + continue; + case UseCaptureKind::PASSTHROUGH: + if (!AddUses(U->getUser())) + return; + continue; } - // TODO(captures): We could keep track of ResultCC for the users. - if (capturesAnything(CI.ResultCC) && !AddUses(U->getUser())) - return; } // All uses examined. diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d25c1eecaf1ca..59002cd934ab1 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2788,8 +2788,7 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, struct CustomCaptureTracker : public CaptureTracker { bool Captured = false; void tooManyUses() override { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { if (auto *ICmp = dyn_cast(U->getUser())) { // Comparison against value stored in global variable. 
Given the // pointer does not escape, its value cannot be guessed and stored @@ -2797,11 +2796,11 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, unsigned OtherIdx = 1 - U->getOperandNo(); auto *LI = dyn_cast(ICmp->getOperand(OtherIdx)); if (LI && isa(LI->getPointerOperand())) - return Continue; + return false; } Captured = true; - return Stop; + return true; } }; CustomCaptureTracker Tracker; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b5d1bc81b9d95..e2d607368e94b 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -711,20 +711,6 @@ CaptureInfo CallBase::getCaptureInfo(unsigned OpNo) const { return OBU.isDeoptOperandBundle() ? CaptureInfo::none() : CaptureInfo::all(); } -bool CallBase::hasArgumentWithAdditionalReturnCaptureComponents() const { - for (unsigned I = 0, E = arg_size(); I < E; ++I) { - if (!getArgOperand(I)->getType()->isPointerTy()) - continue; - - CaptureInfo CI = getParamAttributes(I).getCaptureInfo(); - if (auto *Fn = dyn_cast(getCalledOperand())) - CI &= Fn->getAttributes().getParamAttrs(I).getCaptureInfo(); - if (capturesAnything(CI.getRetComponents() & ~CI.getOtherComponents())) - return true; - } - return false; -} - //===----------------------------------------------------------------------===// // CallInst Implementation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index c1dd8bc393f33..17e7fada10827 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3970,17 +3970,18 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // TODO: We should track the capturing uses in AANoCapture but the problem // is CGSCC runs. For those we would need to "allow" AANoCapture for // a value in the module slice. - // TODO(captures): Make this more precise. - UseCaptureInfo CI = - DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); - if (capturesNothing(CI)) + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: return true; - if (CI.isPassthrough()) { + case UseCaptureKind::MAY_CAPTURE: + LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI + << "\n"); + return false; + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI << "\n"); - return false; + llvm_unreachable("unknown UseCaptureKind"); }; bool IsKnownNoCapture; @@ -6018,16 +6019,16 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { }; auto UseCheck = [&](const Use &U, bool &Follow) -> bool { - // TODO(captures): Make this more precise. 
- UseCaptureInfo CI = - DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); - if (capturesNothing(CI)) + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: return true; - if (CI.isPassthrough()) { + case UseCaptureKind::MAY_CAPTURE: + return checkUse(A, T, U, Follow); + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - return checkUse(A, T, U, Follow); + llvm_unreachable("Unexpected use capture kind!"); }; if (!A.checkForAllUses(UseCheck, *this, *V)) @@ -12150,13 +12151,16 @@ struct AAGlobalValueInfoFloating : public AAGlobalValueInfo { auto UsePred = [&](const Use &U, bool &Follow) -> bool { Uses.insert(&U); - // TODO(captures): Make this more precise. - UseCaptureInfo CI = DetermineUseCaptureKind(U, /*Base=*/nullptr, nullptr); - if (CI.isPassthrough()) { + switch (DetermineUseCaptureKind(U, nullptr)) { + case UseCaptureKind::NO_CAPTURE: + return checkUse(A, U, Follow, Worklist); + case UseCaptureKind::MAY_CAPTURE: + return checkUse(A, U, Follow, Worklist); + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - return checkUse(A, U, Follow, Worklist); + return true; }; auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) { Uses.insert(&OldU); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 02b0fcb3981a7..a63e38a7d98ad 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -71,9 +71,7 @@ using namespace llvm; #define DEBUG_TYPE "function-attrs" STATISTIC(NumMemoryAttr, "Number of functions with improved memory attribute"); -STATISTIC(NumCapturesNone, "Number of arguments marked captures(none)"); -STATISTIC(NumCapturesPartial, "Number of arguments marked with captures " - "attribute other than captures(none)"); +STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReturned, "Number of arguments marked returned"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); @@ -110,13 +108,6 @@ static cl::opt DisableThinLTOPropagation( "disable-thinlto-funcattrs", cl::init(true), cl::Hidden, cl::desc("Don't propagate function-attrs in thinLTO")); -static void addCapturesStat(CaptureInfo CI) { - if (capturesNothing(CI)) - ++NumCapturesNone; - else - ++NumCapturesPartial; -} - namespace { using SCCNodeSet = SmallSetVector; @@ -507,9 +498,6 @@ namespace { /// SCC of the arguments. struct ArgumentGraphNode { Argument *Definition; - /// CaptureComponents for this argument, excluding captures via Uses. - /// We don't distinguish between other/return captures here. - CaptureComponents CC = CaptureComponents::None; SmallVector Uses; }; @@ -551,36 +539,18 @@ class ArgumentGraph { struct ArgumentUsesTracker : public CaptureTracker { ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {} - void tooManyUses() override { CI = CaptureInfo::all(); } - - Action captured(const Use *U, UseCaptureInfo UseCI) override { - if (updateCaptureInfo(U, UseCI.UseCC)) { - // Don't bother continuing if we already capture everything. - if (capturesAll(CI.getOtherComponents())) - return Stop; - return Continue; - } - - // For SCC argument tracking, we're not going to analyze other/ret - // components separately, so don't follow the return value. 
- return ContinueIgnoringReturn; - } + void tooManyUses() override { Captured = true; } - bool updateCaptureInfo(const Use *U, CaptureComponents CC) { + bool captured(const Use *U) override { CallBase *CB = dyn_cast(U->getUser()); if (!CB) { - if (isa(U->getUser())) - CI |= CaptureInfo::retOnly(CC); - else - // Conservatively assume that the captured value might make its way - // into the return value as well. This could be made more precise. - CI |= CaptureInfo(CC); + Captured = true; return true; } Function *F = CB->getCalledFunction(); if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) { - CI |= CaptureInfo(CC); + Captured = true; return true; } @@ -594,24 +564,22 @@ struct ArgumentUsesTracker : public CaptureTracker { // use. In this case it does not matter if the callee is within our SCC // or not -- we've been captured in some unknown way, and we have to be // conservative. - CI |= CaptureInfo(CC); + Captured = true; return true; } if (UseIndex >= F->arg_size()) { assert(F->isVarArg() && "More params than args in non-varargs call"); - CI |= CaptureInfo(CC); + Captured = true; return true; } - // TODO(captures): Could improve precision by remembering maximum - // capture components for the argument. Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); return false; } - // Does not include potential captures via Uses in the SCC. - CaptureInfo CI = CaptureInfo::none(); + // True only if certainly captured (used outside our SCC). + bool Captured = false; // Uses within our SCC. SmallVector Uses; @@ -1226,15 +1194,6 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, bool SkipInitializes) { ArgumentGraph AG; - auto DetermineAccessAttrsForSingleton = [](Argument *A) { - SmallPtrSet Self; - Self.insert(A); - Attribute::AttrKind R = determinePointerAccessAttrs(A, Self); - if (R != Attribute::None) - return addAccessAttr(A, R); - return false; - }; - // Check each function in turn, determining which pointer arguments are not // captured. for (Function *F : SCCNodes) { @@ -1255,7 +1214,7 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (A.getType()->isPointerTy() && !A.hasNoCaptureAttr()) { A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), CaptureInfo::none())); - ++NumCapturesNone; + ++NumNoCapture; Changed.insert(F); } } @@ -1266,23 +1225,21 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (!A.getType()->isPointerTy()) continue; bool HasNonLocalUses = false; - CaptureInfo OrigCI = A.getAttributes().getCaptureInfo(); - if (!capturesNothing(OrigCI)) { + if (!A.hasNoCaptureAttr()) { ArgumentUsesTracker Tracker(SCCNodes); PointerMayBeCaptured(&A, &Tracker); - CaptureInfo NewCI = Tracker.CI & OrigCI; - if (NewCI != OrigCI) { + if (!Tracker.Captured) { if (Tracker.Uses.empty()) { - // If the information is complete, add the attribute now. - A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), NewCI)); - addCapturesStat(NewCI); + // If it's trivially not captured, mark it nocapture now. + A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), + CaptureInfo::none())); + ++NumNoCapture; Changed.insert(F); } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. ArgumentGraphNode *Node = AG[&A]; - Node->CC = CaptureComponents(NewCI); for (Argument *Use : Tracker.Uses) { Node->Uses.push_back(AG[Use]); if (Use != &A) @@ -1297,8 +1254,12 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // an SCC? 
Note that we don't allow any calls at all here, or else our // result will be dependent on the iteration order through the // functions in the SCC. - if (DetermineAccessAttrsForSingleton(&A)) - Changed.insert(F); + SmallPtrSet Self; + Self.insert(&A); + Attribute::AttrKind R = determinePointerAccessAttrs(&A, Self); + if (R != Attribute::None) + if (addAccessAttr(&A, R)) + Changed.insert(F); } if (!SkipInitializes && !A.onlyReadsMemory()) { if (inferInitializes(A, *F)) @@ -1324,17 +1285,17 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (ArgumentSCC[0]->Uses.size() == 1 && ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { Argument *A = ArgumentSCC[0]->Definition; - CaptureInfo OrigCI = A->getAttributes().getCaptureInfo(); - CaptureInfo NewCI = CaptureInfo(ArgumentSCC[0]->CC) & OrigCI; - if (NewCI != OrigCI) { - A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI)); - addCapturesStat(NewCI); - Changed.insert(A->getParent()); - } - - // Infer the access attributes given the new captures one - if (DetermineAccessAttrsForSingleton(A)) - Changed.insert(A->getParent()); + A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), + CaptureInfo::none())); + ++NumNoCapture; + Changed.insert(A->getParent()); + + // Infer the access attributes given the new nocapture one + SmallPtrSet Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); + if (R != Attribute::None) + addAccessAttr(A, R); } continue; } @@ -1346,45 +1307,27 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, ArgumentSCCNodes.insert(I->Definition); } - // At the SCC level, only track merged CaptureComponents. We're not - // currently prepared to handle propagation of return-only captures across - // the SCC. - CaptureComponents CC = CaptureComponents::None; + bool SCCCaptured = false; for (ArgumentGraphNode *N : ArgumentSCC) { for (ArgumentGraphNode *Use : N->Uses) { Argument *A = Use->Definition; - if (ArgumentSCCNodes.count(A)) - CC |= Use->CC; - else - CC |= CaptureComponents(A->getAttributes().getCaptureInfo()); + if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) + continue; + SCCCaptured = true; break; } - if (capturesAll(CC)) + if (SCCCaptured) break; } - - if (!capturesAll(CC)) { - for (ArgumentGraphNode *N : ArgumentSCC) { - Argument *A = N->Definition; - CaptureInfo OrigCI = A->getAttributes().getCaptureInfo(); - CaptureInfo NewCI = CaptureInfo(N->CC | CC) & OrigCI; - if (NewCI != OrigCI) { - A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI)); - addCapturesStat(NewCI); - Changed.insert(A->getParent()); - } - } - } - - // TODO(captures): Ignore address-only captures. - if (capturesAnything(CC)) { - // As the pointer may be captured, determine the pointer attributes - // looking at each argument invidivually. - for (ArgumentGraphNode *N : ArgumentSCC) { - if (DetermineAccessAttrsForSingleton(N->Definition)) - Changed.insert(N->Definition->getParent()); - } + if (SCCCaptured) continue; + + for (ArgumentGraphNode *N : ArgumentSCC) { + Argument *A = N->Definition; + A->addAttr( + Attribute::getWithCaptureInfo(A->getContext(), CaptureInfo::none())); + ++NumNoCapture; + Changed.insert(A->getParent()); } // We also want to compute readonly/readnone/writeonly. 
With a small number diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 76020d2b1dbf4..00a8117f32e70 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -882,8 +882,7 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) { void tooManyUses() override { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { auto *ICmp = dyn_cast(U->getUser()); // We need to check that U is based *only* on the alloca, and doesn't // have other contributions from a select/phi operand. @@ -893,11 +892,11 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) { // Collect equality icmps of the alloca, and don't treat them as // captures. ICmps[ICmp] |= 1u << U->getOperandNo(); - return Continue; + return false; } Captured = true; - return Stop; + return true; } }; diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9a729b7afb8b9..87b27beb01a0a 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1550,33 +1550,32 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, } if (!Visited.insert(&U).second) continue; - UseCaptureInfo CI = - DetermineUseCaptureKind(U, AI, IsDereferenceableOrNull); - // TODO(captures): Make this more precise. - if (CI.isPassthrough()) { + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::MAY_CAPTURE: + return false; + case UseCaptureKind::PASSTHROUGH: + // Instructions cannot have non-instruction users. Worklist.push_back(UI); continue; - } - - if (capturesAnything(CI)) - return false; - - if (UI->isLifetimeStartOrEnd()) { - // We note the locations of these intrinsic calls so that we can - // delete them later if the optimization succeeds, this is safe - // since both llvm.lifetime.start and llvm.lifetime.end intrinsics - // practically fill all the bytes of the alloca with an undefined - // value, although conceptually marked as alive/dead. - int64_t Size = cast(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; + case UseCaptureKind::NO_CAPTURE: { + if (UI->isLifetimeStartOrEnd()) { + // We note the locations of these intrinsic calls so that we can + // delete them later if the optimization succeeds, this is safe + // since both llvm.lifetime.start and llvm.lifetime.end intrinsics + // practically fill all the bytes of the alloca with an undefined + // value, although conceptually marked as alive/dead. 
+ int64_t Size = cast(UI->getOperand(0))->getSExtValue(); + if (Size < 0 || Size == DestSize) { + LifetimeMarkers.push_back(UI); + continue; + } } + if (UI->hasMetadata(LLVMContext::MD_noalias)) + NoAliasInstrs.insert(UI); + if (!ModRefCallback(UI)) + return false; + } } - if (UI->hasMetadata(LLVMContext::MD_noalias)) - NoAliasInstrs.insert(UI); - if (!ModRefCallback(UI)) - return false; } } return true; diff --git a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll index a3b065667702f..f706184f9727e 100644 --- a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll +++ b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll @@ -14,7 +14,7 @@ define ptr @b(ptr %q) { ret ptr %tmp } -; CHECK: define ptr @c(ptr readnone returned captures(address_is_null, ret: address, provenance) %r) +; CHECK: define ptr @c(ptr readnone returned %r) @g = global i32 0 define ptr @c(ptr %r) { %a = icmp eq ptr %r, null diff --git a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll index 99406696d33d1..13954694eefe0 100644 --- a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll +++ b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll @@ -145,8 +145,8 @@ return: ; preds = %cond.end, %if.then3 ; TEST SCC test returning a pointer value argument ; -; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned captures(ret: address, provenance) %r) -; FNATTR: define ptr @ptr_scc_r1(ptr readnone %a, ptr readnone %r, ptr readnone captures(none) %b) +; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned %r) +; FNATTR: define ptr @ptr_scc_r1(ptr %a, ptr readnone %r, ptr readnone captures(none) %b) ; FNATTR: define ptr @ptr_scc_r2(ptr readnone %a, ptr readnone %b, ptr readnone %r) ; ; @@ -260,8 +260,8 @@ entry: ; TEST another SCC test ; -; FNATTR: define ptr @rt2_helper(ptr readnone captures(address_is_null) %a) -; FNATTR: define ptr @rt2(ptr readnone captures(address_is_null) %a, ptr readnone captures(ret: address, provenance) %b) +; FNATTR: define ptr @rt2_helper(ptr %a) +; FNATTR: define ptr @rt2(ptr readnone %a, ptr readnone %b) define ptr @rt2_helper(ptr %a) #0 { entry: %call = call ptr @rt2(ptr %a, ptr %a) @@ -284,8 +284,8 @@ if.end: ; TEST another SCC test ; -; FNATTR: define ptr @rt3_helper(ptr readnone captures(address_is_null) %a, ptr readnone %b) -; FNATTR: define ptr @rt3(ptr readnone captures(address_is_null) %a, ptr readnone %b) +; FNATTR: define ptr @rt3_helper(ptr %a, ptr %b) +; FNATTR: define ptr @rt3(ptr readnone %a, ptr readnone %b) define ptr @rt3_helper(ptr %a, ptr %b) #0 { entry: %call = call ptr @rt3(ptr %a, ptr %b) @@ -316,7 +316,7 @@ if.end: ; } ; ; -; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned captures(ret: address, provenance) %r) +; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned %r) declare void @unknown_fn(ptr) #0 define ptr @calls_unknown_fn(ptr %r) #0 { @@ -415,7 +415,7 @@ if.end: ; preds = %if.then, %entry ; } ; ; -; FNATTR: define ptr @bitcast(ptr readnone returned captures(ret: address, provenance) %b) +; FNATTR: define ptr @bitcast(ptr readnone returned %b) ; define ptr @bitcast(ptr %b) #0 { entry: @@ -433,7 +433,7 @@ entry: ; } ; ; -; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone %b) ; define ptr @bitcasts_select_and_phi(ptr %b) #0 { entry: @@ -462,7 +462,7 @@ if.end: ; preds = %if.then, %entry ; } 
; ; -; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone %b) ; define ptr @ret_arg_arg_undef(ptr %b) #0 { entry: @@ -494,7 +494,7 @@ ret_undef: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_arg(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_undef_arg_arg(ptr readnone %b) ; define ptr @ret_undef_arg_arg(ptr %b) #0 { entry: @@ -526,7 +526,7 @@ ret_arg1: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone %b) define ptr @ret_undef_arg_undef(ptr %b) #0 { entry: %cmp = icmp eq ptr %b, null diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 6debe5de3966e..6164f2adbf5b9 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -7,7 +7,7 @@ define ptr @c1(ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @c1 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[Q:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned [[Q:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[Q]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) @@ -512,7 +512,7 @@ define void @test4_1(ptr %x4_1, i1 %c) { define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @test4_2 -; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned captures(ret: address, provenance) [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { +; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; FNATTRS: t: ; FNATTRS-NEXT: call void @test4_1(ptr null, i1 [[C]]) @@ -740,7 +740,7 @@ define void @captureStrip(ptr %p) { define i1 @captureICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmp -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -757,7 +757,7 @@ define i1 @captureICmp(ptr %x) { define i1 @captureICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmpRev -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr null, [[X]] ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -771,29 +771,10 @@ define i1 @captureICmpRev(ptr %x) { ret i1 %1 } -define i1 @captureICmpWrongPred(ptr %x) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @captureICmpWrongPred -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = icmp slt 
ptr [[X]], null -; FNATTRS-NEXT: ret i1 [[TMP1]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @captureICmpWrongPred -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp slt ptr [[X]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] -; - %1 = icmp slt ptr %x, null - ret i1 %1 -} - -; We could infer captures(address_is_null) here, but don't bother, because -; InstCombine will optimize the GEP away. define i1 @nocaptureInboundsGEPICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmp -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -813,7 +794,7 @@ define i1 @nocaptureInboundsGEPICmp(ptr %x) { define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmpRev -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr null, [[TMP1]] ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -830,46 +811,6 @@ define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ret i1 %2 } -define i1 @notInboundsGEPICmp(ptr %x) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @notInboundsGEPICmp -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; FNATTRS-NEXT: ret i1 [[TMP2]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @notInboundsGEPICmp -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] -; - %1 = getelementptr i32, ptr %x, i32 5 - %2 = icmp eq ptr %1, null - ret i1 %2 -} - -define i1 @inboundsGEPICmpNullPointerDefined(ptr %x) null_pointer_is_valid { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) -; FNATTRS-LABEL: define i1 @inboundsGEPICmpNullPointerDefined -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR16:[0-9]+]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; FNATTRS-NEXT: ret i1 [[TMP2]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @inboundsGEPICmpNullPointerDefined -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR12:[0-9]+]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] -; - %1 = getelementptr i32, ptr %x, i32 5 - %2 = 
icmp eq ptr %1, null - ret i1 %2 -} - define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @nocaptureDereferenceableOrNullICmp @@ -890,13 +831,13 @@ define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) define i1 @captureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) null_pointer_is_valid { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @captureDereferenceableOrNullICmp -; FNATTRS-SAME: (ptr readnone captures(address_is_null) dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16]] { +; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; ATTRIBUTOR-LABEL: define i1 @captureDereferenceableOrNullICmp -; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12]] { +; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] ; @@ -962,7 +903,7 @@ define void @readnone_indirec(ptr %f, ptr %p) { define ptr @captures_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @captures_ret_only -; FNATTRS-SAME: (ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], i64 8 ; FNATTRS-NEXT: ret ptr [[GEP]] ; @@ -976,8 +917,6 @@ define ptr @captures_ret_only(ptr %p) { ret ptr %gep } -; Even though the ptrtoint is only used in the return value, this should *not* -; be considered a read-only capture. 
define i64 @captures_not_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i64 @captures_not_ret_only @@ -996,52 +935,35 @@ define i64 @captures_not_ret_only(ptr %p) { } define void @captures_read_provenance(ptr %p) { -; FNATTRS-LABEL: define void @captures_read_provenance -; FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { -; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @captures_read_provenance -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define void @captures_read_provenance +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; COMMON-NEXT: ret void ; call void @capture(ptr captures(address, read_provenance) %p) ret void } define void @captures_unused_ret(ptr %p) { -; FNATTRS-LABEL: define void @captures_unused_ret -; FNATTRS-SAME: (ptr captures(address_is_null) [[P:%.*]]) { -; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @captures_unused_ret -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define void @captures_unused_ret +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; COMMON-NEXT: ret void ; call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret void } define ptr @captures_used_ret(ptr %p) { -; FNATTRS-LABEL: define ptr @captures_used_ret -; FNATTRS-SAME: (ptr captures(address_is_null, ret: address, provenance) [[P:%.*]]) { -; FNATTRS-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret ptr [[RET]] -; -; ATTRIBUTOR-LABEL: define ptr @captures_used_ret -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret ptr [[RET]] +; COMMON-LABEL: define ptr @captures_used_ret +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; COMMON-NEXT: ret ptr [[RET]] ; %ret = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret ptr %ret } -; Make sure this is does not produce captures(ret: ...). We need to take the -; return capture components into account when handling argument SCCs. 
define ptr @scc_capture_via_ret(i1 %c, ptr %p) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @scc_capture_via_ret @@ -1077,72 +999,5 @@ else: ret ptr %p } -define i1 @improve_existing_captures(ptr captures(address) %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @improve_existing_captures -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[P:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null -; FNATTRS-NEXT: ret i1 [[CMP]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @improve_existing_captures -; ATTRIBUTOR-SAME: (ptr nofree readnone captures(address) [[P:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null -; ATTRIBUTOR-NEXT: ret i1 [[CMP]] -; - %cmp = icmp eq ptr %p, null - ret i1 %cmp -} - -define void @dont_increase_existing_captures(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @capture(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @capture(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_trivial_scc(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures_trivial_scc -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; COMMON-NEXT: call void @dont_increase_existing_captures_trivial_scc(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @capture(ptr captures(address, read_provenance) %p) - call void @dont_increase_existing_captures_trivial_scc(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_scc1(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures_scc1 -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @dont_increase_existing_captures_scc2(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @dont_increase_existing_captures_scc2(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_scc2(ptr %p) { -; FNATTRS-LABEL: define void @dont_increase_existing_captures_scc2 -; FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { -; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; FNATTRS-NEXT: call void @dont_increase_existing_captures_scc1(ptr [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @dont_increase_existing_captures_scc2 -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: call void @dont_increase_existing_captures_scc1(ptr [[P]]) -; ATTRIBUTOR-NEXT: ret void -; - call void @capture(ptr captures(address, read_provenance) %p) - call void @dont_increase_existing_captures_scc1(ptr %p) - ret void -} - declare ptr @llvm.launder.invariant.group.p0(ptr) declare ptr @llvm.strip.invariant.group.p0(ptr) diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 94093568419af..0f6762f0d4342 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -19,7 +19,7 @@ define ptr @test1() { ; Return a pointer trivially nonnull (argument attribute) define ptr @test2(ptr nonnull 
%p) { ; FNATTRS-LABEL: define nonnull ptr @test2( -; FNATTRS-SAME: ptr nonnull readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: ptr nonnull readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[P]] ; ; ATTRIBUTOR-LABEL: define nonnull ptr @test2( @@ -194,7 +194,7 @@ exit: define ptr @test7(ptr %a) { ; FNATTRS-LABEL: define ptr @test7( -; FNATTRS-SAME: ptr readnone returned captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone returned [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr [[A]] ; ; ATTRIBUTOR-LABEL: define ptr @test7( @@ -206,7 +206,7 @@ define ptr @test7(ptr %a) { define ptr @test8(ptr %a) { ; FNATTRS-LABEL: define nonnull ptr @test8( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -221,7 +221,7 @@ define ptr @test8(ptr %a) { define ptr @test9(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test9( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -238,7 +238,7 @@ declare void @llvm.assume(i1) ; FIXME: missing nonnull define ptr @test10(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test10( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 ; FNATTRS-NEXT: call void @llvm.assume(i1 [[CMP]]) ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] @@ -263,7 +263,7 @@ define ptr @test10(ptr %a, i64 %n) { ; } define ptr @test11(ptr) local_unnamed_addr { ; FNATTRS-LABEL: define nonnull ptr @test11( -; FNATTRS-SAME: ptr readnone captures(address_is_null, ret: address, provenance) [[TMP0:%.*]]) local_unnamed_addr { +; FNATTRS-SAME: ptr readnone [[TMP0:%.*]]) local_unnamed_addr { ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null ; FNATTRS-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP5:%.*]] ; FNATTRS: 3: @@ -362,7 +362,7 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { ; FIXME: missing nonnull It should be nonnull @f1(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f1( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { +; FNATTRS-SAME: ptr readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null ; FNATTRS-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] @@ -431,7 +431,7 @@ bb9: ; preds = %bb4, %bb define internal ptr @f2(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f2(ptr nonnull %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f2( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -452,7 +452,7 @@ bb: define dso_local noalias ptr @f3(ptr %arg) { ; FIXME: missing nonnull. 
It should be nonnull @f3(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define dso_local noalias nonnull ptr @f3( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -945,7 +945,7 @@ exc: define ptr @gep1(ptr %p) { ; FNATTRS-LABEL: define nonnull ptr @gep1( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -961,7 +961,7 @@ define ptr @gep1(ptr %p) { define ptr @gep1_no_null_opt(ptr %p) #0 { ; Should't be able to derive nonnull based on gep. ; FNATTRS-LABEL: define ptr @gep1_no_null_opt( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR8:[0-9]+]] { +; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR8:[0-9]+]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -976,7 +976,7 @@ define ptr @gep1_no_null_opt(ptr %p) #0 { define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FNATTRS-LABEL: define ptr addrspace(3) @gep2( -; FNATTRS-SAME: ptr addrspace(3) readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 1 ; FNATTRS-NEXT: ret ptr addrspace(3) [[Q]] ; @@ -992,7 +992,7 @@ define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FIXME: We should propagate dereferenceable here but *not* nonnull define ptr addrspace(3) @as(ptr addrspace(3) dereferenceable(4) %p) { ; FNATTRS-LABEL: define noundef ptr addrspace(3) @as( -; FNATTRS-SAME: ptr addrspace(3) readnone returned captures(ret: address, provenance) dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone returned dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr addrspace(3) [[P]] ; ; ATTRIBUTOR-LABEL: define ptr addrspace(3) @as( @@ -1383,7 +1383,7 @@ define void @PR43833_simple(ptr %0, i32 %1) { define ptr @pr91177_non_inbounds_gep(ptr nonnull %arg) { ; FNATTRS-LABEL: define ptr @pr91177_non_inbounds_gep( -; FNATTRS-SAME: ptr nonnull readnone captures(ret: address, provenance) [[ARG:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr nonnull readnone [[ARG:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[RES:%.*]] = getelementptr i8, ptr [[ARG]], i64 -8 ; FNATTRS-NEXT: ret ptr [[RES]] ; diff --git a/llvm/test/Transforms/FunctionAttrs/noundef.ll b/llvm/test/Transforms/FunctionAttrs/noundef.ll index 4f53c08804621..b7c583880501a 100644 --- a/llvm/test/Transforms/FunctionAttrs/noundef.ll +++ b/llvm/test/Transforms/FunctionAttrs/noundef.ll @@ -169,7 +169,7 @@ define i64 @test_trunc_with_constexpr() { define align 4 ptr @maybe_not_aligned(ptr noundef %p) { ; CHECK-LABEL: define align 4 ptr @maybe_not_aligned( -; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -177,7 +177,7 @@ define align 4 ptr @maybe_not_aligned(ptr noundef %p) { define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { ; CHECK-LABEL: define noundef align 4 ptr @definitely_aligned( -; CHECK-SAME: ptr noundef readnone returned align 4 captures(ret: address, 
provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned align 4 [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -185,7 +185,7 @@ define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { ; CHECK-LABEL: define nonnull ptr @maybe_not_nonnull( -; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -193,7 +193,7 @@ define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { define nonnull ptr @definitely_nonnull(ptr noundef nonnull %p) { ; CHECK-LABEL: define noundef nonnull ptr @definitely_nonnull( -; CHECK-SAME: ptr noundef nonnull readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef nonnull readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 5fc88d623c0ec..b24c097ad54d0 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -35,7 +35,7 @@ define void @test1_2(ptr %x1_2, ptr %y1_2, ptr %z1_2) { define ptr @test2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define {{[^@]+}}@test2 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: store i32 0, ptr @x, align 4 ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -58,7 +58,7 @@ define ptr @test2(ptr %p) { define i1 @test3(ptr %p, ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test3 -; FNATTRS-SAME: (ptr readnone captures(address) [[P:%.*]], ptr readnone captures(address) [[Q:%.*]]) #[[ATTR1:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone [[P:%.*]], ptr readnone [[Q:%.*]]) #[[ATTR1:[0-9]+]] { ; FNATTRS-NEXT: [[A:%.*]] = icmp ult ptr [[P]], [[Q]] ; FNATTRS-NEXT: ret i1 [[A]] ; @@ -197,7 +197,7 @@ define void @test7_2(ptr preallocated(i32) %a) { define ptr @test8_1(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test8_1 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR1]] { +; FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -220,7 +220,7 @@ entry: define void @test8_2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; FNATTRS-LABEL: define {{[^@]+}}@test8_2 -; FNATTRS-SAME: (ptr writeonly captures(none) [[P:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: [[CALL:%.*]] = call ptr @test8_1(ptr [[P]]) ; FNATTRS-NEXT: store i32 10, ptr [[CALL]], align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/stats.ll b/llvm/test/Transforms/FunctionAttrs/stats.ll index dc0387e57174a..5f007b4078ff3 100644 --- a/llvm/test/Transforms/FunctionAttrs/stats.ll +++ b/llvm/test/Transforms/FunctionAttrs/stats.ll @@ -16,8 +16,8 @@ entry: ret void } -; CHECK: 1 function-attrs - Number of arguments 
marked captures(none) -; CHECK-NEXT: 2 function-attrs - Number of functions with improved memory attribute +; CHECK: 2 function-attrs - Number of functions with improved memory attribute +; CHECK-NEXT: 1 function-attrs - Number of arguments marked nocapture ; CHECK-NEXT: 1 function-attrs - Number of functions marked as nofree ; CHECK-NEXT: 2 function-attrs - Number of functions marked as norecurse ; CHECK-NEXT: 2 function-attrs - Number of functions marked as nosync diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll index 7175816963ed1..e01dba328a3a1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll @@ -9,7 +9,7 @@ target triple = "aarch64" define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 { ; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_( -; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly captures(address_is_null) [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0 ; CHECK-NEXT: br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll index d5edf83ee52e2..bbd4849c32296 100644 --- a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll +++ b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll @@ -12,7 +12,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, half %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 ; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll index 4b422f205138a..ee7698b116aa2 100644 --- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll +++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll @@ -14,7 +14,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) 
local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 ; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll index cd2ed37b22db5..5f75bd788e4bb 100644 --- a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll +++ b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll @@ -11,7 +11,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { ; NOROTATION-LABEL: define void @test( -; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { ; NOROTATION-NEXT: entry: ; NOROTATION-NEXT: br label [[LOOP_HEADER:%.*]] ; NOROTATION: loop.header: @@ -26,7 +26,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { ; NOROTATION-NEXT: ret void ; ; ROTATION-LABEL: define void @test( -; ROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; ROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { ; ROTATION-NEXT: entry: ; ROTATION-NEXT: [[_12_I1:%.*]] = icmp eq ptr [[START]], [[END]] ; ROTATION-NEXT: br i1 [[_12_I1]], label [[EXIT:%.*]], label [[LOOP_LATCH_PREHEADER:%.*]] diff --git a/llvm/unittests/Analysis/CaptureTrackingTest.cpp b/llvm/unittests/Analysis/CaptureTrackingTest.cpp index 3f5c10d935167..73dd82fb921f7 100644 --- a/llvm/unittests/Analysis/CaptureTrackingTest.cpp +++ b/llvm/unittests/Analysis/CaptureTrackingTest.cpp @@ -77,9 +77,9 @@ TEST(CaptureTracking, MaxUsesToExplore) { struct CollectingCaptureTracker : public CaptureTracker { SmallVector Captures; void tooManyUses() override { } - Action captured(const Use *U, UseCaptureInfo CI) override { + bool captured(const Use *U) override { Captures.push_back(U); - return Continue; + return false; } }; From 70e693c77f0044643f3a301a0b7bc334a6e558c9 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 19 Feb 2025 08:34:15 -0800 Subject: [PATCH 079/220] [lldb] Gardening in StreamAsynchronousIO (NFC) (#127717) A handful of minor improvements to StreamAsynchronousIO: - Document the class. - Use a named enum value to distinguishing between stdout and stderr. - Add missing period to comment. - Clear the string instead of assigning to it. - Eliminate color argument. 
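For illustration, here is a minimal, self-contained sketch (not the real lldb classes) of the enum-with-bool-underlying-type pattern this change introduces: call sites name the destination, while code that still takes a plain `bool` -- as `Debugger::PrintAsync` is assumed to -- keeps working through the implicit conversion of an unscoped enum to its underlying type.

```cpp
#include <cstdio>

struct StreamAsynchronousIO {
  // Unscoped enum with a bool underlying type: named at the call site,
  // still usable wherever a plain bool is expected.
  enum ForSTDOUT : bool { STDOUT = true, STDERR = false };

  explicit StreamAsynchronousIO(ForSTDOUT dest) : m_for_stdout(dest) {}

  void Print(const char *msg) const {
    // m_for_stdout converts to bool implicitly, mirroring the assumed
    // PrintAsync(data, size, m_for_stdout) call in the patch.
    std::fputs(msg, m_for_stdout ? stdout : stderr);
  }

private:
  ForSTDOUT m_for_stdout;
};

int main() {
  StreamAsynchronousIO out(StreamAsynchronousIO::STDOUT);
  StreamAsynchronousIO err(StreamAsynchronousIO::STDERR);
  out.Print("to stdout\n");
  err.Print("to stderr\n");
}
```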
--- lldb/include/lldb/Core/StreamAsynchronousIO.h | 12 ++++++++++-- lldb/source/Core/Debugger.cpp | 6 ++++-- lldb/source/Core/StreamAsynchronousIO.cpp | 12 ++++++------ 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/lldb/include/lldb/Core/StreamAsynchronousIO.h b/lldb/include/lldb/Core/StreamAsynchronousIO.h index b7adbc42096ce..7ae65757e2d73 100644 --- a/lldb/include/lldb/Core/StreamAsynchronousIO.h +++ b/lldb/include/lldb/Core/StreamAsynchronousIO.h @@ -18,9 +18,17 @@ namespace lldb_private { class Debugger; +/// A stream meant for asynchronously printing output. Output is buffered until +/// the stream is flushed or destroyed. Printing is handled by the currently +/// active IOHandler, or the debugger's output or error stream if there is none. class StreamAsynchronousIO : public Stream { public: - StreamAsynchronousIO(Debugger &debugger, bool for_stdout, bool colors); + enum ForSTDOUT : bool { + STDOUT = true, + STDERR = false, + }; + + StreamAsynchronousIO(Debugger &debugger, ForSTDOUT for_stdout); ~StreamAsynchronousIO() override; @@ -32,7 +40,7 @@ class StreamAsynchronousIO : public Stream { private: Debugger &m_debugger; std::string m_data; - bool m_for_stdout; + ForSTDOUT m_for_stdout; }; } // namespace lldb_private diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 18cdec4e0af73..8b7814d434ee9 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -1321,11 +1321,13 @@ bool Debugger::PopIOHandler(const IOHandlerSP &pop_reader_sp) { } StreamSP Debugger::GetAsyncOutputStream() { - return std::make_shared(*this, true, GetUseColor()); + return std::make_shared(*this, + StreamAsynchronousIO::STDOUT); } StreamSP Debugger::GetAsyncErrorStream() { - return std::make_shared(*this, false, GetUseColor()); + return std::make_shared(*this, + StreamAsynchronousIO::STDERR); } void Debugger::RequestInterrupt() { diff --git a/lldb/source/Core/StreamAsynchronousIO.cpp b/lldb/source/Core/StreamAsynchronousIO.cpp index c2c64b61ab726..dbd56a69675b4 100644 --- a/lldb/source/Core/StreamAsynchronousIO.cpp +++ b/lldb/source/Core/StreamAsynchronousIO.cpp @@ -14,20 +14,20 @@ using namespace lldb; using namespace lldb_private; -StreamAsynchronousIO::StreamAsynchronousIO(Debugger &debugger, bool for_stdout, - bool colors) - : Stream(0, 4, eByteOrderBig, colors), m_debugger(debugger), m_data(), - m_for_stdout(for_stdout) {} +StreamAsynchronousIO::StreamAsynchronousIO( + Debugger &debugger, StreamAsynchronousIO::ForSTDOUT for_stdout) + : Stream(0, 4, eByteOrderBig, debugger.GetUseColor()), m_debugger(debugger), + m_data(), m_for_stdout(for_stdout) {} StreamAsynchronousIO::~StreamAsynchronousIO() { - // Flush when we destroy to make sure we display the data + // Flush when we destroy to make sure we display the data. Flush(); } void StreamAsynchronousIO::Flush() { if (!m_data.empty()) { m_debugger.PrintAsync(m_data.data(), m_data.size(), m_for_stdout); - m_data = std::string(); + m_data.clear(); } } From ab3d793982acb946afc2028ca41304913879c6c9 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Wed, 19 Feb 2025 11:36:45 -0500 Subject: [PATCH 080/220] [libc++] Optimize ranges::move{,_backward} for vector::iterator (#121109) As a follow-up to #121013 (which optimized `ranges::copy`) and #121026 (which optimized `ranges::copy_backward`), this PR enhances the performance of `std::ranges::{move, move_backward}` for `vector::iterator`, addressing a subtask outlined in issue #64038. 
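As a quick user-level illustration of what gets faster, here is a sketch using only the standard API; the target is the bit-packed `std::vector<bool>` specialization, per the `__bit_iterator` changes in the diff below.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<bool> src(1000, true);
  std::vector<bool> dst(1000, false);

  // Moving bool bits is the same as copying them, so this call can take the
  // word-at-a-time bit-copy fast path wired up for vector<bool>::iterator.
  auto result = std::ranges::move(src, dst.begin());
  assert(result.out == dst.end());
  assert(dst.front() && dst.back());

  // The backward variant is covered as well.
  std::ranges::move_backward(dst, src.end());
  assert(src.front() && src.back());
}
```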
The optimizations bring performance improvements analogous to those achieved for the `{copy, copy_backward}` algorithms: up to 2000x for aligned moves and 60x for unaligned moves. Moreover, comprehensive tests covering up to 4 storage words (256 bytes) with odd and even bit sizes are provided, which validate the proposed optimizations in this patch. --- libcxx/docs/ReleaseNotes/21.rst | 4 +- libcxx/include/__algorithm/move.h | 10 ++ libcxx/include/__algorithm/move_backward.h | 10 ++ libcxx/include/__bit_reference | 16 --- .../test/benchmarks/algorithms/move.bench.cpp | 71 +++++++++++ .../algorithms/move_backward.bench.cpp | 71 +++++++++++ .../alg.move/move.pass.cpp | 55 ++++++-- .../alg.move/move_backward.pass.cpp | 62 ++++++--- .../alg.move/ranges.move.pass.cpp | 108 +++++++++++----- .../alg.move/ranges.move_backward.pass.cpp | 119 +++++++++++++----- 10 files changed, 414 insertions(+), 112 deletions(-) create mode 100644 libcxx/test/benchmarks/algorithms/move.bench.cpp create mode 100644 libcxx/test/benchmarks/algorithms/move_backward.bench.cpp diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 2439360797023..88a0666611a9a 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -43,8 +43,8 @@ Implemented Papers Improvements and New Features ----------------------------- -- The ``std::ranges::{copy, copy_n, copy_backward}`` algorithms have been optimized for ``std::vector::iterator``\s, - resulting in a performance improvement of up to 2000x. +- The ``std::ranges::{copy, copy_n, copy_backward, move, move_backward}`` algorithms have been optimized for + ``std::vector::iterator``, resulting in a performance improvement of up to 2000x. - Updated formatting library to Unicode 16.0.0. diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h index 6f3b0eb5d2927..a3320e9f1985d 100644 --- a/libcxx/include/__algorithm/move.h +++ b/libcxx/include/__algorithm/move.h @@ -9,11 +9,13 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_H #define _LIBCPP___ALGORITHM_MOVE_H +#include <__algorithm/copy.h> #include <__algorithm/copy_move_common.h> #include <__algorithm/for_each_segment.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/common_type.h> @@ -98,6 +100,14 @@ struct __move_impl { } } + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { + return std::__copy(__first, __last, __result); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h index 24a8d9b24527a..14482fee18114 100644 --- a/libcxx/include/__algorithm/move_backward.h +++ b/libcxx/include/__algorithm/move_backward.h @@ -9,10 +9,12 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_BACKWARD_H #define _LIBCPP___ALGORITHM_MOVE_BACKWARD_H +#include <__algorithm/copy_backward.h> #include <__algorithm/copy_move_common.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/common_type.h> @@ -107,6 +109,14 @@ struct __move_backward_impl { } } + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { + return std::__copy_backward<_ClassicAlgPolicy>(__first, __last, __result); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference index aad470394732c..377f5fed12266 100644 --- a/libcxx/include/__bit_reference +++ b/libcxx/include/__bit_reference @@ -210,22 +210,6 @@ private: __mask_(__m) {} }; -// move - -template -inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> -move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return std::copy(__first, __last, __result); -} - -// move_backward - -template -inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> move_backward( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return std::copy_backward(__first, __last, __result); -} - // swap_ranges template diff --git a/libcxx/test/benchmarks/algorithms/move.bench.cpp b/libcxx/test/benchmarks/algorithms/move.bench.cpp new file mode 100644 index 0000000000000..73f36f0c129de --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/move.bench.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include + +template +void bm_ranges_move_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + if constexpr (aligned) { + benchmark::DoNotOptimize(std::ranges::move(*in, std::ranges::begin(*out))); + } else { + benchmark::DoNotOptimize( + std::ranges::move(std::views::counted(in->begin() + 4, n - 4), std::ranges::begin(*out))); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +template +void bm_move_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + auto first1 = in->begin(); + auto last1 = in->end(); + auto first2 = out->begin(); + if constexpr (aligned) { + benchmark::DoNotOptimize(std::move(first1, last1, first2)); + } else { + benchmark::DoNotOptimize(std::move(first1 + 4, last1, first2)); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(bm_ranges_move_vb) + ->Name("bm_ranges_move_vb_aligned") + ->Range(8, 1 << 16) + ->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_move_vb)->Name("bm_ranges_move_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK(bm_move_vb)->Name("bm_move_vb_aligned")->Range(8, 1 << 20); +BENCHMARK(bm_move_vb)->Name("bm_move_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp b/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp new file mode 100644 index 0000000000000..23d7395198419 --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include + +template +void bm_ranges_move_backward_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + if constexpr (aligned) { + benchmark::DoNotOptimize(std::ranges::move_backward(*in, std::ranges::end(*out))); + } else { + benchmark::DoNotOptimize( + std::ranges::move_backward(std::views::counted(in->begin(), n - 4), std::ranges::end(*out))); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +template +void bm_move_backward_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + auto first1 = in->begin(); + auto last1 = in->end(); + auto last2 = out->end(); + if constexpr (aligned) { + benchmark::DoNotOptimize(std::move_backward(first1, last1, last2)); + } else { + benchmark::DoNotOptimize(std::move_backward(first1, last1 - 4, last2)); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(bm_ranges_move_backward_vb) + ->Name("bm_ranges_move_backward_vb_aligned") + ->Range(8, 1 << 16) + ->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_move_backward_vb)->Name("bm_ranges_move_backward_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK(bm_move_backward_vb)->Name("bm_move_backward_vb_aligned")->Range(8, 1 << 20); +BENCHMARK(bm_move_backward_vb)->Name("bm_move_backward_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp index b1ad6873bc5e5..1afaa1a7e6da1 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include "MoveOnly.h" #include "test_iterators.h" @@ -45,15 +46,15 @@ struct Test { template TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::move(InIter(ia), InIter(ia+N), OutIter(ib)); - assert(base(r) == ib+N); + OutIter r = std::move(InIter(ia), InIter(ia + N), OutIter(ib)); + assert(base(r) == ib + N); for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); + assert(ia[i] == ib[i]); } }; @@ -73,13 +74,13 @@ struct Test1 { const unsigned N = 100; std::unique_ptr ia[N]; for (unsigned i = 0; i < N; ++i) - ia[i].reset(new int(i)); + ia[i].reset(new int(i)); std::unique_ptr ib[N]; - OutIter r = std::move(InIter(ia), InIter(ia+N), OutIter(ib)); - assert(base(r) == ib+N); + OutIter r = std::move(InIter(ia), InIter(ia + N), OutIter(ib)); + assert(base(r) == ib + N); for (unsigned i = 0; i < N; ++i) - assert(*ib[i] == static_cast(i)); + assert(*ib[i] == static_cast(i)); } }; @@ -92,6 +93,28 @@ struct Test1OutIters { } }; +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + 
std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move with aligned bytes + std::vector in(v); + std::vector out(N); + std::move(in.begin(), in.end(), out.begin()); + assert(out == v); + } + { // Test move with unaligned bytes + std::vector in(v); + std::vector out(N); + std::move(in.begin() + 4, in.end(), out.begin()); + for (std::size_t i = 0; i < N - 4; ++i) + assert(v[i + 4] == out[i]); + } + + return true; +} + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::cpp17_input_iterator_list(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) @@ -118,7 +141,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When non-trivial { MoveOnly from[3] = {1, 2, 3}; - MoveOnly to[3] = {}; + MoveOnly to[3] = {}; std::move(std::begin(from), std::end(from), std::begin(to)); assert(to[0] == MoveOnly(1)); assert(to[1] == MoveOnly(2)); @@ -127,7 +150,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When trivial { TrivialMoveOnly from[3] = {1, 2, 3}; - TrivialMoveOnly to[3] = {}; + TrivialMoveOnly to[3] = {}; std::move(std::begin(from), std::end(from), std::begin(to)); assert(to[0] == TrivialMoveOnly(1)); assert(to[1] == TrivialMoveOnly(2)); @@ -135,6 +158,16 @@ TEST_CONSTEXPR_CXX20 bool test() { } } + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + return true; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp index 61dea47b51071..3c0fcadb2d036 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include "MoveOnly.h" #include "test_iterators.h" @@ -44,24 +45,22 @@ struct Test { template TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::move_backward(InIter(ia), InIter(ia+N), OutIter(ib+N)); + OutIter r = std::move_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); + assert(ia[i] == ib[i]); } }; struct TestOutIters { template TEST_CONSTEXPR_CXX20 void operator()() { - types::for_each( - types::concatenate_t >(), - Test()); + types::for_each(types::concatenate_t >(), Test()); } }; @@ -72,24 +71,46 @@ struct Test1 { const unsigned N = 100; std::unique_ptr ia[N]; for (unsigned i = 0; i < N; ++i) - ia[i].reset(new int(i)); + ia[i].reset(new int(i)); std::unique_ptr ib[N]; - OutIter r = std::move_backward(InIter(ia), InIter(ia+N), OutIter(ib+N)); + OutIter r = std::move_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) - assert(*ib[i] == static_cast(i)); + assert(*ib[i] == static_cast(i)); } }; struct Test1OutIters { template TEST_CONSTEXPR_CXX23 void operator()() { - types::for_each(types::concatenate_t*> >(), - Test1()); + types::for_each( + types::concatenate_t*> >(), Test1()); } }; +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { 
// Test move_backward with aligned bytes + std::vector in(v); + std::vector out(N); + std::move_backward(in.begin(), in.end(), out.end()); + assert(out == v); + } + { // Test move_backward with unaligned bytes + std::vector in(v); + std::vector out(N); + std::move_backward(in.begin(), in.end() - 4, out.end()); + for (std::size_t i = 0; i < N - 4; ++i) + assert(out[i + 4] == v[i]); + } + + return true; +} + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::bidirectional_iterator_list(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) @@ -117,7 +138,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When non-trivial { MoveOnly from[3] = {1, 2, 3}; - MoveOnly to[3] = {}; + MoveOnly to[3] = {}; std::move_backward(std::begin(from), std::end(from), std::end(to)); assert(to[0] == MoveOnly(1)); assert(to[1] == MoveOnly(2)); @@ -126,7 +147,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When trivial { TrivialMoveOnly from[3] = {1, 2, 3}; - TrivialMoveOnly to[3] = {}; + TrivialMoveOnly to[3] = {}; std::move_backward(std::begin(from), std::end(from), std::end(to)); assert(to[0] == TrivialMoveOnly(1)); assert(to[1] == TrivialMoveOnly(2)); @@ -134,11 +155,20 @@ TEST_CONSTEXPR_CXX20 bool test() { } } + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + return true; } -int main(int, char**) -{ +int main(int, char**) { test(); #if TEST_STD_VER >= 20 static_assert(test()); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp index a0d1473360a14..1a89408865892 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp @@ -31,6 +31,7 @@ #include "almost_satisfies_types.h" #include "MoveOnly.h" #include "test_iterators.h" +#include "test_macros.h" template > concept HasMoveIt = requires(In in, Sent sent, Out out) { std::ranges::move(in, sent, out); }; @@ -65,7 +66,7 @@ constexpr void test(std::array in) { { std::array out; std::same_as> decltype(auto) ret = - std::ranges::move(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data())); + std::ranges::move(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data() + out.size()); @@ -73,8 +74,7 @@ constexpr void test(std::array in) { { std::array out; auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size()))); - std::same_as> decltype(auto) ret = - std::ranges::move(range, Out(out.data())); + std::same_as> decltype(auto) ret = std::ranges::move(range, Out(out.data())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data() + out.size()); @@ -84,16 +84,16 @@ constexpr void test(std::array in) { template constexpr void test_containers() { { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); std::same_as> auto ret = - std::ranges::move(In(in.begin()), Sent(In(in.end())), Out(out.begin())); + std::ranges::move(In(in.begin()), Sent(In(in.end())), Out(out.begin())); assert(std::ranges::equal(in, out)); assert(base(ret.in) == in.end()); assert(base(ret.out) == out.end()); } { - 
InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); auto range = std::ranges::subrange(In(in.begin()), Sent(In(in.end()))); std::same_as> auto ret = std::ranges::move(range, Out(out.begin())); @@ -165,22 +165,52 @@ constexpr void test_proxy_in_iterators() { } struct IteratorWithMoveIter { - using value_type = int; - using difference_type = int; + using value_type = int; + using difference_type = int; explicit IteratorWithMoveIter() = default; int* ptr; constexpr IteratorWithMoveIter(int* ptr_) : ptr(ptr_) {} constexpr int& operator*() const; // iterator with iter_move should not be dereferenced - constexpr IteratorWithMoveIter& operator++() { ++ptr; return *this; } - constexpr IteratorWithMoveIter operator++(int) { auto ret = *this; ++*this; return ret; } + constexpr IteratorWithMoveIter& operator++() { + ++ptr; + return *this; + } + constexpr IteratorWithMoveIter operator++(int) { + auto ret = *this; + ++*this; + return ret; + } friend constexpr int iter_move(const IteratorWithMoveIter&) { return 42; } constexpr bool operator==(const IteratorWithMoveIter& other) const = default; }; +#if TEST_STD_VER >= 23 +constexpr bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move with aligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move(in, out.begin()); + assert(out == v); + } + { // Test move with unaligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move(std::views::counted(in.begin() + 4, N - 4), out.begin()); + assert(std::ranges::equal(v | std::views::drop(4), out | std::views::take(N - 4))); + } + + return true; +} +#endif + // cpp17_intput_iterator has a defaulted template argument template using Cpp17InIter = cpp17_input_iterator; @@ -267,13 +297,13 @@ constexpr bool test() { { // check that ranges::dangling is returned std::array out; std::same_as> decltype(auto) ret = - std::ranges::move(std::array {1, 2, 3, 4}, out.data()); + std::ranges::move(std::array{1, 2, 3, 4}, out.data()); assert(ret.out == out.data() + 4); assert((out == std::array{1, 2, 3, 4})); } { // check that an iterator is returned with a borrowing range - std::array in {1, 2, 3, 4}; + std::array in{1, 2, 3, 4}; std::array out; std::same_as::iterator, int*>> decltype(auto) ret = std::ranges::move(std::views::all(in), out.data()); @@ -284,8 +314,8 @@ constexpr bool test() { { // check that every element is moved exactly once struct MoveOnce { - bool moved = false; - constexpr MoveOnce() = default; + bool moved = false; + constexpr MoveOnce() = default; constexpr MoveOnce(const MoveOnce& other) = delete; constexpr MoveOnce& operator=(MoveOnce&& other) { assert(!other.moved); @@ -294,16 +324,16 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.moved; })); } { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move(in, out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); @@ -314,8 +344,8 @@ constexpr bool test() { { // check that the range is moved forwards struct OnlyForwardsMovable { OnlyForwardsMovable* next = nullptr; - bool canMove = false; - OnlyForwardsMovable() = default; + bool canMove = false; + OnlyForwardsMovable() = default; constexpr 
OnlyForwardsMovable& operator=(OnlyForwardsMovable&&) { assert(canMove); if (next != nullptr) @@ -324,12 +354,12 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; - out[0].next = &out[1]; - out[1].next = &out[2]; + std::array in{}; + std::array out{}; + out[0].next = &out[1]; + out[1].next = &out[2]; out[0].canMove = true; - auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); + auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(out[0].canMove); @@ -337,12 +367,12 @@ constexpr bool test() { assert(out[2].canMove); } { - std::array in {}; - std::array out {}; - out[0].next = &out[1]; - out[1].next = &out[2]; + std::array in{}; + std::array out{}; + out[0].next = &out[1]; + out[1].next = &out[2]; out[0].canMove = true; - auto ret = std::ranges::move(in, out.begin()); + auto ret = std::ranges::move(in, out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(out[0].canMove); @@ -358,19 +388,31 @@ constexpr bool test() { auto ret = std::ranges::move(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4), b.data()); assert(ret.in == a + 4); assert(ret.out == b.data() + 4); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } { int a[] = {1, 2, 3, 4}; std::array b; auto range = std::ranges::subrange(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4)); - auto ret = std::ranges::move(range, b.data()); + auto ret = std::ranges::move(range, b.data()); assert(ret.in == a + 4); assert(ret.out == b.data() + 4); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } } +#if TEST_STD_VER >= 23 + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } +#endif + return true; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp index 47cf178636ad1..923b4c790dd1d 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp @@ -31,6 +31,7 @@ #include "almost_satisfies_types.h" #include "MoveOnly.h" #include "test_iterators.h" +#include "test_macros.h" template > concept HasMoveBackwardIt = requires(In in, Sent sent, Out out) { std::ranges::move_backward(in, sent, out); }; @@ -65,7 +66,7 @@ constexpr void test(std::array in) { { std::array out; std::same_as> decltype(auto) ret = - std::ranges::move_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); + std::ranges::move_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data()); @@ -92,16 +93,16 @@ constexpr void test_iterators() { template constexpr void test_containers() { { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); std::same_as> auto ret = - std::ranges::move_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); + std::ranges::move_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); assert(std::ranges::equal(in, out)); 
assert(base(ret.in) == in.end()); assert(base(ret.out) == out.begin()); } { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); auto range = std::ranges::subrange(In(in.begin()), Sent(In(in.end()))); std::same_as> auto ret = std::ranges::move_backward(range, Out(out.end())); @@ -159,25 +160,62 @@ constexpr void test_proxy_in_iterators() { } struct IteratorWithMoveIter { - using value_type = int; - using difference_type = int; + using value_type = int; + using difference_type = int; explicit IteratorWithMoveIter() = default; int* ptr; constexpr IteratorWithMoveIter(int* ptr_) : ptr(ptr_) {} constexpr int& operator*() const; // iterator with iter_move should not be dereferenced - constexpr IteratorWithMoveIter& operator++() { ++ptr; return *this; } - constexpr IteratorWithMoveIter operator++(int) { auto ret = *this; ++*this; return ret; } + constexpr IteratorWithMoveIter& operator++() { + ++ptr; + return *this; + } + constexpr IteratorWithMoveIter operator++(int) { + auto ret = *this; + ++*this; + return ret; + } - constexpr IteratorWithMoveIter& operator--() { --ptr; return *this; } - constexpr IteratorWithMoveIter operator--(int) { auto ret = *this; --*this; return ret; } + constexpr IteratorWithMoveIter& operator--() { + --ptr; + return *this; + } + constexpr IteratorWithMoveIter operator--(int) { + auto ret = *this; + --*this; + return ret; + } friend constexpr int iter_move(const IteratorWithMoveIter&) { return 42; } constexpr bool operator==(const IteratorWithMoveIter& other) const = default; }; +#if TEST_STD_VER >= 23 +constexpr bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move_backward with aligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move_backward(in, out.end()); + assert(out == v); + } + { // Test move_backward with unaligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move_backward(std::views::counted(in.begin(), N - 4), out.end()); + assert(std::ranges::equal(v | std::views::take(N - 4), out | std::views::drop(4))); + } + + return true; +} +#endif + constexpr bool test() { test_in_iterators(); test_in_iterators(); @@ -243,7 +281,8 @@ constexpr bool test() { MoveOnly b[3]; ProxyRange proxyA{a}; ProxyRange proxyB{b}; - std::ranges::move_backward(std::begin(proxyA), std::end(proxyA), std::ranges::next(proxyB.begin(), std::end(proxyB))); + std::ranges::move_backward( + std::begin(proxyA), std::end(proxyA), std::ranges::next(proxyB.begin(), std::end(proxyB))); assert(b[0].get() == 1); assert(b[1].get() == 2); assert(b[2].get() == 3); @@ -253,13 +292,13 @@ constexpr bool test() { { // check that ranges::dangling is returned std::array out; std::same_as> auto ret = - std::ranges::move_backward(std::array {1, 2, 3, 4}, out.data() + out.size()); + std::ranges::move_backward(std::array{1, 2, 3, 4}, out.data() + out.size()); assert(ret.out == out.data()); assert((out == std::array{1, 2, 3, 4})); } { // check that an iterator is returned with a borrowing range - std::array in {1, 2, 3, 4}; + std::array in{1, 2, 3, 4}; std::array out; std::same_as::iterator, int*>> auto ret = std::ranges::move_backward(std::views::all(in), out.data() + out.size()); @@ -270,8 +309,8 @@ constexpr bool test() { { // check that every element is moved exactly once struct MoveOnce { - bool moved = false; - constexpr MoveOnce() = default; + bool moved = false; + constexpr MoveOnce() = default; constexpr MoveOnce(const MoveOnce& other) = delete; 
constexpr MoveOnce& operator=(const MoveOnce& other) { assert(!other.moved); @@ -280,16 +319,16 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.moved; })); } { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move_backward(in, out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); @@ -300,8 +339,8 @@ constexpr bool test() { { // check that the range is moved backwards struct OnlyBackwardsMovable { OnlyBackwardsMovable* next = nullptr; - bool canMove = false; - OnlyBackwardsMovable() = default; + bool canMove = false; + OnlyBackwardsMovable() = default; constexpr OnlyBackwardsMovable& operator=(const OnlyBackwardsMovable&) { assert(canMove); if (next != nullptr) @@ -310,12 +349,12 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; - out[1].next = &out[0]; - out[2].next = &out[1]; + std::array in{}; + std::array out{}; + out[1].next = &out[0]; + out[2].next = &out[1]; out[2].canMove = true; - auto ret = std::ranges::move_backward(in, out.end()); + auto ret = std::ranges::move_backward(in, out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(out[0].canMove); @@ -323,12 +362,12 @@ constexpr bool test() { assert(out[2].canMove); } { - std::array in {}; - std::array out {}; - out[1].next = &out[0]; - out[2].next = &out[1]; + std::array in{}; + std::array out{}; + out[1].next = &out[0]; + out[2].next = &out[1]; out[2].canMove = true; - auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); + auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(out[0].canMove); @@ -344,19 +383,31 @@ constexpr bool test() { auto ret = std::ranges::move_backward(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4), b.data() + b.size()); assert(ret.in == a + 4); assert(ret.out == b.data()); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } { int a[] = {1, 2, 3, 4}; std::array b; auto range = std::ranges::subrange(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4)); - auto ret = std::ranges::move_backward(range, b.data() + b.size()); + auto ret = std::ranges::move_backward(range, b.data() + b.size()); assert(ret.in == a + 4); assert(ret.out == b.data()); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } } +#if TEST_STD_VER >= 23 + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } +#endif + return true; } From 210036a22eefa2e33d1a76a62d4ec6f5bc66a92b Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 19 Feb 2025 11:37:24 -0500 Subject: [PATCH 081/220] [AMDGPU][True16][CodeGen] true16 codegen pattern for fma (#127240) Previous PR https://github.com/llvm/llvm-project/pull/122950 get reverted since it hit the buildbot failure. Another patch get merged when this PR is under review, and thus causing one test not up to date. repen this PR and fixed the issue. 
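For reference, the IR shape this pattern selection targets is a plain f16 fma call; with -mattr=+real-true16 on gfx11 it should now lower to the _t16 VOP forms (e.g. V_FMAC_F16_t16_e64 / V_FMA_F16_gfx9_t16_e64 operating on the 16-bit register halves) rather than the fake16 forms. A minimal sketch, illustrative only (the function name is made up; the updated checks in fma.f16.ll below are authoritative):

  define half @fma_t16_example(half %x, half %y, half %z) {
    ; with +real-true16 this is expected to select a true16 FMA such as v_fmac_f16 on v*.l operands
    %r = call half @llvm.fma.f16(half %x, half %y, half %z)
    ret half %r
  }

Assumed run line for the sketch, matching the updated tests: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16.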
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 125 +++++-- llvm/lib/Target/AMDGPU/SIInstructions.td | 8 + .../Target/AMDGPU/SIShrinkInstructions.cpp | 17 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 68 ++-- .../CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 3 +- llvm/test/CodeGen/AMDGPU/fma.f16.ll | 328 +++++++++++++----- llvm/test/CodeGen/AMDGPU/preserve-hi16.ll | 3 +- .../CodeGen/AMDGPU/shrink-mad-fma-fake16.mir | 242 +++++++++++++ .../CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir | 258 ++++++++++++++ llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir | 115 +----- 11 files changed, 913 insertions(+), 256 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index ab396929162d0..fa15e73bc31d5 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -203,6 +203,8 @@ static unsigned macToMad(unsigned Opc) { return AMDGPU::V_FMA_F32_e64; case AMDGPU::V_FMAC_F16_e64: return AMDGPU::V_FMA_F16_gfx9_e64; + case AMDGPU::V_FMAC_F16_t16_e64: + return AMDGPU::V_FMA_F16_gfx9_t16_e64; case AMDGPU::V_FMAC_F16_fake16_e64: return AMDGPU::V_FMA_F16_gfx9_fake16_e64; case AMDGPU::V_FMAC_LEGACY_F32_e64: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7dace11d208a0..2691a4135b6f2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3461,6 +3461,62 @@ std::optional SIInstrInfo::extractSubregFromImm(int64_t Imm, llvm_unreachable("covered subregister switch"); } +static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAD_F16_e64: + return AMDGPU::V_MADAK_F16; + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAD_F32_e64: + return AMDGPU::V_MADAK_F32; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMA_F32_e64: + return AMDGPU::V_FMAAK_F32; + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMA_F16_e64: + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? AMDGPU::V_FMAAK_F16_t16 + : AMDGPU::V_FMAAK_F16_fake16 + : AMDGPU::V_FMAAK_F16; + default: + llvm_unreachable("invalid instruction"); + } +} + +static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAD_F16_e64: + return AMDGPU::V_MADMK_F16; + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAD_F32_e64: + return AMDGPU::V_MADMK_F32; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMA_F32_e64: + return AMDGPU::V_FMAMK_F32; + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMA_F16_e64: + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? 
AMDGPU::V_FMAMK_F16_t16 + : AMDGPU::V_FMAMK_F16_fake16 + : AMDGPU::V_FMAMK_F16; + default: + llvm_unreachable("invalid instruction"); + } +} + bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) @@ -3533,6 +3589,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. @@ -3555,6 +3612,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -3586,18 +3644,15 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, !isInlineConstant(Def->getOperand(1))) return false; - unsigned NewOpc = - IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16) - : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); + unsigned NewOpc = getNewFMAMKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite - // would also require restricting their register classes. For now - // just bail out. - if (NewOpc == AMDGPU::V_FMAMK_F16_fake16) + // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16 + // takes VGPR_32_Lo128 operands, so the rewrite would also require + // restricting their register classes. For now just bail out. + if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || + NewOpc == AMDGPU::V_FMAMK_F16_fake16) return false; const std::optional SubRegImm = extractSubregFromImm( @@ -3613,7 +3668,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setIsKill(RegSrc->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3666,25 +3721,22 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } } - unsigned NewOpc = - IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16) - : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); + unsigned NewOpc = getNewFMAAKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite - // would also require restricting their register classes. For now - // just bail out. - if (NewOpc == AMDGPU::V_FMAAK_F16_fake16) + // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16 + // takes VGPR_32_Lo128 operands, so the rewrite would also require + // restricting their register classes. For now just bail out. 
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || + NewOpc == AMDGPU::V_FMAAK_F16_fake16) return false; // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3874,8 +3926,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { return AMDGPU::V_FMA_LEGACY_F32_e64; case AMDGPU::V_FMAC_F16_e32: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: - return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64 + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? AMDGPU::V_FMA_F16_gfx9_t16_e64 + : AMDGPU::V_FMA_F16_gfx9_fake16_e64 : AMDGPU::V_FMA_F16_gfx9_e64; case AMDGPU::V_FMAC_F32_e32: case AMDGPU::V_FMAC_F32_e64: @@ -3941,19 +3996,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } - assert( - Opc != AMDGPU::V_FMAC_F16_fake16_e32 && - "V_FMAC_F16_fake16_e32 is not supported and not expected to be present " - "pre-RA"); + assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 && + Opc != AMDGPU::V_FMAC_F16_fake16_e32 && + "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be " + "present pre-RA"); // Handle MAC/FMAC. bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64; bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; @@ -3968,6 +4025,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return nullptr; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: @@ -4052,11 +4110,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { - unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16) - : AMDGPU::V_FMAAK_F32) - : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); + unsigned NewOpc = getNewFMAAKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -4071,11 +4125,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } } - unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16) - : AMDGPU::V_FMAMK_F32) - : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); + unsigned NewOpc = getNewFMAMKInst(ST, Opc); if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) @@ -4513,6 +4563,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_FMAC_F32_e64: case AMDGPU::V_FMAC_F64_e64: @@ -5569,7 +5620,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; - case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64; + case AMDGPU::S_FMAC_F16: + return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64 + : AMDGPU::V_FMAC_F16_fake16_e64; case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32; case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32; case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6e08aff24ec23..3faf0795157dc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3287,6 +3287,14 @@ def : GCNPat < (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >; +let True16Predicate = UseRealTrue16Insts in +def : GCNPat < + (fma (f16 (VOP3NoMods f16:$src0)), + (f16 (VOP3NoMods f16:$src1)), + (f16 (VOP3NoMods f16:$src2))), + (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2) +>; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < (fma (f16 (VOP3NoMods f16:$src0)), diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 979812e07fc3f..f03cde455f295 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + NewOpcode = AMDGPU::V_FMAAK_F16; + break; + case AMDGPU::V_FMA_F16_gfx9_t16_e64: + NewOpcode = AMDGPU::V_FMAAK_F16_t16; + break; case AMDGPU::V_FMA_F16_gfx9_fake16_e64: - NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16; + NewOpcode = AMDGPU::V_FMAAK_F16_fake16; break; } } @@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + NewOpcode = AMDGPU::V_FMAMK_F16; + break; + case AMDGPU::V_FMA_F16_gfx9_t16_e64: + NewOpcode = AMDGPU::V_FMAMK_F16_t16; + break; case AMDGPU::V_FMA_F16_gfx9_fake16_e64: - NewOpcode = ST->hasTrue16BitInsts() ? 
AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16; + NewOpcode = AMDGPU::V_FMAMK_F16_fake16; break; } } @@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { shrinkMadFma(MI); continue; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index 99e6c5d06a0e1..0b09cabf25a16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -3,7 +3,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define float @v_fma_f32(float %x, float %y, float %z) { ; GFX6-LABEL: v_fma_f32: @@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %fma = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %fma } @@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_lhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg half %x %fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z) ret half %fma @@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_rhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg half %y %fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z) ret half %fma @@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_add: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg half %z %fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z) ret half %fma diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index ac7944f25fe37..23e4b80b61f69 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s +# FIXME-TRUE16. 
reenable after fix-sgpr-copies is fixed for true16 flow +# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 52a23690dcf53..a33fd03e0ce03 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -3,8 +3,10 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL @@ -24,11 +26,34 @@ define half @test_fma(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fma: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fma: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fma: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fma: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fma: +; GFX11-GISEL-FAKE16: ; %bb.0: 
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fma: ; GFX12: ; %bb.0: @@ -57,11 +82,31 @@ define half @test_fmac(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmac: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmac: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmac: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmac: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmac: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmac: ; GFX12: ; %bb.0: @@ -98,11 +143,31 @@ define half @test_fmaak(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmaak: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmaak: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v0.h, 0x4200 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmaak: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmaak: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmaak: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmaak: ; GFX12: ; %bb.0: @@ -139,11 +204,33 @@ define half @test_fmamk(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmamk: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-SDAG-TRUE16-LABEL: test_fmamk: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmamk: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmamk: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 0x4200, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmamk: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmamk: ; GFX12: ; %bb.0: @@ -208,33 +295,61 @@ define i32 @test_D139469_f16(half %arg) { ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_D139469_f16: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-SDAG-NEXT: v_min_f16_e32 v0, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_D139469_f16: -; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 -; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16: 
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e +; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_D139469_f16: ; GFX12-SDAG: ; %bb.0: ; %bb @@ -347,44 +462,83 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_D139469_v2f16: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e -; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_D139469_v2f16: -; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 -; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e +; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e +; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e +; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e +; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_D139469_v2f16: ; GFX12-SDAG: ; %bb.0: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 0ad1c30b5b5a4..1f36101c7b53a 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -814,7 +814,8 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir new file mode 100644 index 0000000000000..d551ad88f56b7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir @@ -0,0 +1,242 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 + +--- +name: mad_cvv_f32 +body: | + bb.0: + ; GFX11-LABEL: 
name: mad_cvv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vcv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vvc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vsc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_cvv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vcv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vvc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_vsc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vsc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_cvv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_cvv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vcv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vvc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vsc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_cvv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vcv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_vvc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vvc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vsc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir new file mode 100644 index 0000000000000..89ef5df9beb8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir @@ -0,0 +1,258 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10 + +--- +name: mad_cvv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_cvv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vcv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vvc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vsc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_cvv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_cvv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vcv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vvc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vsc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_cvv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_cvv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vcv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vvc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: mad_vsc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vsc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_cvv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vcv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vvc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vsc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir index 26feb8120c751..c9138dda7d1a7 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir @@ -1,17 +1,10 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 --- name: mad_cvv_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_cvv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -27,12 +20,6 @@ body: | name: mad_vcv_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vcv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -48,12 +35,6 @@ body: | name: mad_vvc_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vvc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vvc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -69,12 +50,6 @@ body: | name: mad_vsc_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vsc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -90,12 +65,6 @@ body: | name: fma_cvv_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_cvv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -111,12 +80,6 @@ body: | name: fma_vcv_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vcv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -132,12 +95,6 @@ body: | name: fma_vvc_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vvc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vvc_f32 ; 
GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -153,12 +110,6 @@ body: | name: fma_vsc_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vsc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -174,12 +125,6 @@ body: | name: mad_cvv_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_cvv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -195,12 +140,6 @@ body: | name: mad_vcv_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vcv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -216,12 +155,6 @@ body: | name: mad_vvc_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vvc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -237,12 +170,6 @@ body: | name: mad_vsc_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vsc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -258,20 +185,14 @@ body: | name: fma_cvv_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_cvv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, 18688, 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... 
@@ -279,20 +200,14 @@ body: | name: fma_vcv_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vcv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, 18688, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... @@ -300,20 +215,14 @@ body: | name: fma_vvc_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vvc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_lo16, $vgpr1_lo16, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... @@ -321,19 +230,13 @@ body: | name: fma_vsc_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vsc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_hi16, $vgpr1_hi16, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $sgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_hi16, 0, $vgpr1_hi16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... From e7bf54d62771219145171c66584578972edf0e30 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Wed, 19 Feb 2025 11:38:28 -0500 Subject: [PATCH 082/220] [flang] AliasAnalysis: Handle fir.load on hlfir.designate (#127107) For example, determine that the address in `obj%p` below cannot alias the address of `v`: ``` module m type :: ty real, pointer :: p end type ty end module m subroutine test() use m real, target :: t real :: v type(ty) :: obj obj%p => t v = obj%p end subroutine test ``` --- .../flang/Optimizer/Analysis/AliasAnalysis.h | 8 +- .../lib/Optimizer/Analysis/AliasAnalysis.cpp | 18 +- .../AliasAnalysis/load-ptr-designate.fir | 511 ++++++++++++++++++ 3 files changed, 531 insertions(+), 6 deletions(-) create mode 100644 flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir diff --git a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h index 8d17e4e476d10..c71988d081dd0 100644 --- a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h +++ b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h @@ -211,14 +211,14 @@ struct AliasAnalysis { fir::AliasAnalysis::Source getSource(mlir::Value, bool getLastInstantiationPoint = false); + /// Return true, if `ty` is a reference type to a boxed + /// POINTER object or a raw fir::PointerType. + static bool isPointerReference(mlir::Type ty); + private: /// Return true, if `ty` is a reference type to an object of derived type /// that contains a component with POINTER attribute. static bool isRecordWithPointerComponent(mlir::Type ty); - - /// Return true, if `ty` is a reference type to a boxed - /// POINTER object or a raw fir::PointerType. - static bool isPointerReference(mlir::Type ty); }; inline bool operator==(const AliasAnalysis::Source::SourceOrigin &lhs, diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 873758487ddd0..70fa18ad65b9b 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -54,10 +54,11 @@ static bool hasGlobalOpTargetAttr(mlir::Value v, fir::AddrOfOp op) { static mlir::Value getOriginalDef(mlir::Value v, fir::AliasAnalysis::Source::Attributes &attributes, - bool &isCapturedInInternalProcedure) { + bool &isCapturedInInternalProcedure, bool &approximateSource) { mlir::Operation *defOp; bool breakFromLoop = false; while (!breakFromLoop && (defOp = v.getDefiningOp())) { + mlir::Type ty = defOp->getResultTypes()[0]; llvm::TypeSwitch(defOp) .Case([&](fir::ConvertOp op) { v = op.getValue(); }) .Case([&](auto op) { @@ -67,6 +68,18 @@ getOriginalDef(mlir::Value v, isCapturedInInternalProcedure |= varIf.isCapturedInInternalProcedure(); }) + .Case([&](auto op) { + if (fir::AliasAnalysis::isPointerReference(ty)) + attributes.set(fir::AliasAnalysis::Attribute::Pointer); + v = op->getOperand(0); + approximateSource = true; + }) + .Case([&](hlfir::DesignateOp op) { + auto varIf = llvm::cast(defOp); + attributes |= getAttrsFromVariable(varIf); + v = op.getMemref(); + approximateSource = true; + }) .Default([&](auto op) { breakFromLoop = true; }); } return v; @@ -609,7 +622,8 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, attributes.set(Attribute::Pointer); auto def = getOriginalDef(op.getMemref(), attributes, - isCapturedInInternalProcedure); + isCapturedInInternalProcedure, + approximateSource); if (auto addrOfOp = def.template getDefiningOp()) { global = addrOfOp.getSymbol(); diff --git a/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir b/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir new file 
mode 100644 index 0000000000000..de81841d9249d --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir @@ -0,0 +1,511 @@ +// Check aliasing with the address *in* (not *of*) a pointer component +// (hlfir.designate). +// +// Throughout this test, the ".fir" suffix on symbols indicates a version of the +// MLIR after convert-hlfir-to-fir. A key difference is that component access +// is via fir.coordinate_of instead of hlfir.designate. We would like alias +// analysis results to be the same in both versions. + +// RUN: fir-opt %s -split-input-file -o /dev/null --mlir-disable-threading \ +// RUN: -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' \ +// RUN: 2>&1 | FileCheck -match-full-lines %s + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// end module m +// subroutine test() +// use m +// real, target :: t +// real :: v +// type(ty) :: obj +// type(ty), target :: t_obj +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer. +// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. 
+// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: NoAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: NoAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. +// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest() { + %0 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "obj", uniq_name = "_QFtestEobj"} + %1:2 = hlfir.declare %0 {test.ptr="obj", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "t_obj", fir.target, uniq_name = "_QFtestEt_obj"} + %5:2 = hlfir.declare %4 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7:2 = hlfir.declare %6 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %8 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %11 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %12 = fir.load %11 : !fir.ref>> + %13 = fir.box_addr %12 {test.ptr="obj%p1.tgt"}: (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %14 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %15 = hlfir.designate 
%1#0{"arr"} <%14> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %16 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %17 = fir.load %16 : !fir.ref>> + %repeat17 = fir.load %16 : !fir.ref>> + %18 = fir.box_addr %17 {test.ptr="obj%alloc.tgt"}: (!fir.box>) -> !fir.heap + %repeat18 = fir.box_addr %repeat17 {test.ptr="obj%alloc.tgt2"}: (!fir.box>) -> !fir.heap + %c2_1 = arith.constant 2 : index + %19 = fir.shape %c2_1 : (index) -> !fir.shape<1> + %c1_2 = arith.constant 1 : index + %20 = hlfir.designate %5#0{"arr"} <%19> (%c1_2) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %21 = hlfir.designate %5#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir() { + %0 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "obj", uniq_name = "_QFtestEobj"} + %1 = fir.declare %0 {test.ptr="obj.fir", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "t_obj", fir.target, uniq_name = "_QFtestEt_obj"} + %5 = fir.declare %4 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7 = fir.declare %6 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %8 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %9 = fir.coordinate_of %1, %8 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %10 = fir.load %9 : !fir.ref>> + %11 = fir.box_addr %10 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %12 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %13 = fir.coordinate_of %1, %12 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %14 = fir.load %13 : !fir.ref>> + %15 = fir.box_addr %14 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %18 = fir.coordinate_of %1, %17 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %19 = fir.array_coor %18(%16) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %20 = 
fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %21 = fir.coordinate_of %1, %20 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %repeat22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat23 = fir.box_addr %repeat22 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %24 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %25 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %26 = fir.coordinate_of %5, %25 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %27 = fir.array_coor %26(%24) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %28 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %29 = fir.coordinate_of %5, %28 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %30 = fir.load %29 : !fir.ref>> + %31 = fir.box_addr %30 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} + +// ----- + +// Repeat above test except composites are dummy args instead of locals. + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// end module m +// subroutine test(obj, t_obj) +// use m +// type(ty) :: obj +// type(ty), target :: t_obj +// real, target :: t +// real :: v +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer. +// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. 
+// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias
+// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias
+// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias
+// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias
+
+// Like a pointer, an allocatable contains an address, but an allocatable is not
+// a pointer and so cannot alias pointers. However, t_obj%alloc is a target.
+// TODO: Thus, we expect the first case (and corresponding .fir case) below to
+// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed
+// as MayAlias because they have the same source and both are data.
+// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias
+// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias
+// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias
+// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias
+
+// The address in an allocatable cannot alias the address of that allocatable.
+// TODO: Thus, we expect all cases below to be NoAlias. However, target dummy
+// args are currently indiscriminately analyzed as MayAlias.
+// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias
+// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: MayAlias
+// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias
+// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias
+
+// The address of a composite aliases the address of any component but not the
+// address in a pointer or allocatable component.
+// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the
+// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the
+// same source and both are data.
+// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias
+// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias
+// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias
+// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias
+// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias
+// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias
+// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias
+// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias
+
+// The addresses obtained via multiple load instructions from the same
+// allocatable can alias.
+// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest(%arg0: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "obj"}, %arg1: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "t_obj", fir.target}) { + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {test.ptr="obj", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4:2 = hlfir.declare %arg1 dummy_scope %0 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %5 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %6:2 = hlfir.declare %5 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %7 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %8 = fir.load %7 : !fir.ref>> + %9 = fir.box_addr %8 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %10 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %11 = fir.load %10 : !fir.ref>> + %12 = fir.box_addr %11 {test.ptr="obj%p1.tgt"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %13 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %14 = hlfir.designate %1#0{"arr"} <%13> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %15 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %16 = fir.load %15 : !fir.ref>> + %repeat16 = fir.load %15 : !fir.ref>> + %17 = fir.box_addr %16 {test.ptr="obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + %repeat17 = fir.box_addr %repeat16 {test.ptr="obj%alloc.tgt2"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %18 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %19 = hlfir.designate %4#0{"arr"} <%18> (%c1_1) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %20 = hlfir.designate %4#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %21 = fir.load %20 : !fir.ref>> + %22 = fir.box_addr %21 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir(%arg0: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "obj"}, %arg1: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "t_obj", fir.target}) { + %0 = fir.dummy_scope : 
!fir.dscope + %1 = fir.declare %arg0 dummy_scope %0 {test.ptr="obj.fir", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.declare %arg1 dummy_scope %0 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %6 = fir.declare %5 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %7 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %8 = fir.coordinate_of %1, %7 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %11 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %12 = fir.coordinate_of %1, %11 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %13 = fir.load %12 : !fir.ref>> + %14 = fir.box_addr %13 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %15 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %16 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %17 = fir.coordinate_of %1, %16 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %18 = fir.array_coor %17(%15) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %19 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %20 = fir.coordinate_of %1, %19 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %21 = fir.load %20 : !fir.ref>> + %repeat21 = fir.load %20 : !fir.ref>> + %22 = fir.box_addr %21 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat22 = fir.box_addr %repeat21 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %23 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %24 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %25 = fir.coordinate_of %4, %24 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %26 = fir.array_coor %25(%23) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %27 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %28 = fir.coordinate_of %4, %27 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %29 = fir.load %28 : !fir.ref>> + %30 = fir.box_addr %29 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} + +// ----- + +// Repeat above test except 
composites are globals. + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// type(ty) :: obj +// type(ty), target :: t_obj +// end module m +// subroutine test() +// use m +// real, target :: t +// real :: v +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer. +// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: NoAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: NoAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. 
However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. +// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest() { + %0 = fir.address_of(@_QMmEobj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %1:2 = hlfir.declare %0 {test.ptr="obj", uniq_name = "_QMmEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4 = fir.address_of(@_QMmEt_obj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5:2 = hlfir.declare %4 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QMmEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7:2 = hlfir.declare %6 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %8 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %12 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %13 = fir.load %12 : !fir.ref>> + %14 = fir.box_addr %13 {test.ptr="obj%p1.tgt"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = hlfir.designate %1#0{"arr"} <%16> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %19 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %20 = fir.load %19 : !fir.ref>> + %repeat20 = fir.load %19 : !fir.ref>> + %21 = fir.box_addr %20 {test.ptr="obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + %repeat21 = fir.box_addr %repeat20 {test.ptr="obj%alloc.tgt2"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %23 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %24 = hlfir.designate %5#0{"arr"} <%23> (%c1_1) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %26 = hlfir.designate %5#0{"alloc"} 
{test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %27 = fir.load %26 : !fir.ref>> + %28 = fir.box_addr %27 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir() { + %0 = fir.address_of(@_QMmEobj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %1 = fir.declare %0 {test.ptr="obj.fir", uniq_name = "_QMmEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.address_of(@_QMmEt_obj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5 = fir.declare %4 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QMmEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7 = fir.declare %6 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %8 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %9 = fir.coordinate_of %1, %8 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %10 = fir.load %9 : !fir.ref>> + %11 = fir.box_addr %10 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %12 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %13 = fir.coordinate_of %1, %12 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %14 = fir.load %13 : !fir.ref>> + %15 = fir.box_addr %14 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %18 = fir.coordinate_of %1, %17 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %19 = fir.array_coor %18(%16) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %20 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %21 = fir.coordinate_of %1, %20 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %repeat22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat23 = fir.box_addr %repeat22 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %24 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %25 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %26 = fir.coordinate_of %5, %25 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %27 = fir.array_coor %26(%24) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %28 = fir.field_index alloc, 
!fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %29 = fir.coordinate_of %5, %28 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %30 = fir.load %29 : !fir.ref>> + %31 = fir.box_addr %30 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} From fda0e63e7331debacf9f36a64dad76339ad86482 Mon Sep 17 00:00:00 2001 From: Jakub Ficek Date: Wed, 19 Feb 2025 18:03:18 +0100 Subject: [PATCH 083/220] [clang] handle fp options in __builtin_convertvector (#125522) This patch allows using fpfeatures pragmas with __builtin_convertvector: - added TrailingObjects with FPOptionsOverride and methods for handling it to ConvertVectorExpr - added support for codegen, node dumping, and serialization of fpfeatures contained in ConvertVectorExpr --- clang/include/clang/AST/Expr.h | 80 +++++++++++++++++++++-- clang/include/clang/AST/Stmt.h | 15 +++++ clang/include/clang/AST/TextNodeDumper.h | 1 + clang/lib/AST/ASTImporter.cpp | 7 +- clang/lib/AST/Expr.cpp | 20 ++++++ clang/lib/AST/TextNodeDumper.cpp | 6 ++ clang/lib/CodeGen/CGExprScalar.cpp | 2 + clang/lib/Sema/SemaChecking.cpp | 4 +- clang/lib/Serialization/ASTReaderStmt.cpp | 13 +++- clang/lib/Serialization/ASTWriterStmt.cpp | 4 ++ clang/test/AST/ast-dump-fpfeatures.cpp | 23 ++++++- clang/test/AST/const-fpfeatures.c | 11 ++++ clang/test/CodeGen/pragma-fenv_access.c | 9 +++ 13 files changed, 183 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index ff4f236c1fa88..0f98d237dcbcd 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -4579,25 +4579,97 @@ class ShuffleVectorExpr : public Expr { /// ConvertVectorExpr - Clang builtin function __builtin_convertvector /// This AST node provides support for converting a vector type to another /// vector type of the same arity. -class ConvertVectorExpr : public Expr { +class ConvertVectorExpr final + : public Expr, + private llvm::TrailingObjects { private: Stmt *SrcExpr; TypeSourceInfo *TInfo; SourceLocation BuiltinLoc, RParenLoc; + friend TrailingObjects; friend class ASTReader; friend class ASTStmtReader; - explicit ConvertVectorExpr(EmptyShell Empty) : Expr(ConvertVectorExprClass, Empty) {} + explicit ConvertVectorExpr(bool HasFPFeatures, EmptyShell Empty) + : Expr(ConvertVectorExprClass, Empty) { + ConvertVectorExprBits.HasFPFeatures = HasFPFeatures; + } -public: ConvertVectorExpr(Expr *SrcExpr, TypeSourceInfo *TI, QualType DstType, ExprValueKind VK, ExprObjectKind OK, - SourceLocation BuiltinLoc, SourceLocation RParenLoc) + SourceLocation BuiltinLoc, SourceLocation RParenLoc, + FPOptionsOverride FPFeatures) : Expr(ConvertVectorExprClass, DstType, VK, OK), SrcExpr(SrcExpr), TInfo(TI), BuiltinLoc(BuiltinLoc), RParenLoc(RParenLoc) { + ConvertVectorExprBits.HasFPFeatures = FPFeatures.requiresTrailingStorage(); + if (hasStoredFPFeatures()) + setStoredFPFeatures(FPFeatures); setDependence(computeDependence(this)); } + size_t numTrailingObjects(OverloadToken) const { + return ConvertVectorExprBits.HasFPFeatures ? 
1 : 0; + } + + FPOptionsOverride &getTrailingFPFeatures() { + assert(ConvertVectorExprBits.HasFPFeatures); + return *getTrailingObjects(); + } + + const FPOptionsOverride &getTrailingFPFeatures() const { + assert(ConvertVectorExprBits.HasFPFeatures); + return *getTrailingObjects(); + } + +public: + static ConvertVectorExpr *CreateEmpty(const ASTContext &C, + bool hasFPFeatures); + + static ConvertVectorExpr *Create(const ASTContext &C, Expr *SrcExpr, + TypeSourceInfo *TI, QualType DstType, + ExprValueKind VK, ExprObjectKind OK, + SourceLocation BuiltinLoc, + SourceLocation RParenLoc, + FPOptionsOverride FPFeatures); + + /// Get the FP contractibility status of this operator. Only meaningful for + /// operations on floating point types. + bool isFPContractableWithinStatement(const LangOptions &LO) const { + return getFPFeaturesInEffect(LO).allowFPContractWithinStatement(); + } + + /// Is FPFeatures in Trailing Storage? + bool hasStoredFPFeatures() const { + return ConvertVectorExprBits.HasFPFeatures; + } + + /// Get FPFeatures from trailing storage. + FPOptionsOverride getStoredFPFeatures() const { + return getTrailingFPFeatures(); + } + + /// Get the store FPOptionsOverride or default if not stored. + FPOptionsOverride getStoredFPFeaturesOrDefault() const { + return hasStoredFPFeatures() ? getStoredFPFeatures() : FPOptionsOverride(); + } + + /// Set FPFeatures in trailing storage, used by Serialization & ASTImporter. + void setStoredFPFeatures(FPOptionsOverride F) { getTrailingFPFeatures() = F; } + + /// Get the FP features status of this operator. Only meaningful for + /// operations on floating point types. + FPOptions getFPFeaturesInEffect(const LangOptions &LO) const { + if (ConvertVectorExprBits.HasFPFeatures) + return getStoredFPFeatures().applyOverrides(LO); + return FPOptions::defaultWithoutTrailingStorage(LO); + } + + FPOptionsOverride getFPOptionsOverride() const { + if (ConvertVectorExprBits.HasFPFeatures) + return getStoredFPFeatures(); + return FPOptionsOverride(); + } + /// getSrcExpr - Return the Expr to be converted. Expr *getSrcExpr() const { return cast(SrcExpr); } diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index 405c6166adb15..604ac51d478cf 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -1215,6 +1215,20 @@ class alignas(void *) Stmt { SourceLocation Loc; }; + class ConvertVectorExprBitfields { + friend class ConvertVectorExpr; + + LLVM_PREFERRED_TYPE(ExprBitfields) + unsigned : NumExprBits; + + // + /// This is only meaningful for operations on floating point + /// types when additional values need to be in trailing storage. + /// It is 0 otherwise. + LLVM_PREFERRED_TYPE(bool) + unsigned HasFPFeatures : 1; + }; + union { // Same order as in StmtNodes.td. 
// Statements @@ -1293,6 +1307,7 @@ class alignas(void *) Stmt { // Clang Extensions OpaqueValueExprBitfields OpaqueValueExprBits; + ConvertVectorExprBitfields ConvertVectorExprBits; }; public: diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index 4b5ad2b5fa74c..81844db2c77fa 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -425,6 +425,7 @@ class TextNodeDumper void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S); void VisitEmbedExpr(const EmbedExpr *S); void VisitAtomicExpr(const AtomicExpr *AE); + void VisitConvertVectorExpr(const ConvertVectorExpr *S); }; } // namespace clang diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index c27ebbf838ad1..43da76e14d0a3 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -7386,9 +7386,10 @@ ExpectedStmt ASTNodeImporter::VisitConvertVectorExpr(ConvertVectorExpr *E) { if (Err) return std::move(Err); - return new (Importer.getToContext()) - ConvertVectorExpr(ToSrcExpr, ToTSI, ToType, E->getValueKind(), - E->getObjectKind(), ToBuiltinLoc, ToRParenLoc); + return ConvertVectorExpr::Create( + Importer.getToContext(), ToSrcExpr, ToTSI, ToType, E->getValueKind(), + E->getObjectKind(), ToBuiltinLoc, ToRParenLoc, + E->getStoredFPFeaturesOrDefault()); } ExpectedStmt ASTNodeImporter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 1f949d495f343..b747aa8df807d 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -3911,6 +3911,8 @@ FPOptions Expr::getFPFeaturesInEffect(const LangOptions &LO) const { return BO->getFPFeaturesInEffect(LO); if (auto Cast = dyn_cast(this)) return Cast->getFPFeaturesInEffect(LO); + if (auto ConvertVector = dyn_cast(this)) + return ConvertVector->getFPFeaturesInEffect(LO); return FPOptions::defaultWithoutTrailingStorage(LO); } @@ -5451,3 +5453,21 @@ OpenACCAsteriskSizeExpr * OpenACCAsteriskSizeExpr::CreateEmpty(const ASTContext &C) { return new (C) OpenACCAsteriskSizeExpr({}, C.IntTy); } + +ConvertVectorExpr *ConvertVectorExpr::CreateEmpty(const ASTContext &C, + bool hasFPFeatures) { + void *Mem = C.Allocate(totalSizeToAlloc(hasFPFeatures), + alignof(ConvertVectorExpr)); + return new (Mem) ConvertVectorExpr(hasFPFeatures, EmptyShell()); +} + +ConvertVectorExpr *ConvertVectorExpr::Create( + const ASTContext &C, Expr *SrcExpr, TypeSourceInfo *TI, QualType DstType, + ExprValueKind VK, ExprObjectKind OK, SourceLocation BuiltinLoc, + SourceLocation RParenLoc, FPOptionsOverride FPFeatures) { + bool HasFPFeatures = FPFeatures.requiresTrailingStorage(); + unsigned Size = totalSizeToAlloc(HasFPFeatures); + void *Mem = C.Allocate(Size, alignof(ConvertVectorExpr)); + return new (Mem) ConvertVectorExpr(SrcExpr, TI, DstType, VK, OK, BuiltinLoc, + RParenLoc, FPFeatures); +} diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 26493caa5d06a..fd1eaab9621dd 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -3069,3 +3069,9 @@ void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) { void TextNodeDumper::VisitAtomicExpr(const AtomicExpr *AE) { OS << ' ' << AE->getOpAsString(); } + +void TextNodeDumper::VisitConvertVectorExpr(const ConvertVectorExpr *S) { + VisitStmt(S); + if (S->hasStoredFPFeatures()) + printFPOptions(S->getStoredFPFeatures()); +} diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp 
index 30f01496ba221..5ee8a1bfa8175 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1949,6 +1949,7 @@ Value *ScalarExprEmitter::VisitConvertVectorExpr(ConvertVectorExpr *E) { llvm::Value *Zero = llvm::Constant::getNullValue(SrcTy); if (SrcEltTy->isFloatingPointTy()) { + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); return Builder.CreateFCmpUNE(Src, Zero, "tobool"); } else { return Builder.CreateICmpNE(Src, Zero, "tobool"); @@ -1975,6 +1976,7 @@ Value *ScalarExprEmitter::VisitConvertVectorExpr(ConvertVectorExpr *E) { } else { assert(SrcEltTy->isFloatingPointTy() && DstEltTy->isFloatingPointTy() && "Unknown real conversion"); + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); if (DstEltTy->getTypeID() < SrcEltTy->getTypeID()) Res = Builder.CreateFPTrunc(Src, DstTy, "conv"); else diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index aae61f612a4bc..74f425d32648f 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5262,8 +5262,8 @@ ExprResult Sema::ConvertVectorExpr(Expr *E, TypeSourceInfo *TInfo, << E->getSourceRange()); } - return new (Context) class ConvertVectorExpr(E, TInfo, DstTy, VK, OK, - BuiltinLoc, RParenLoc); + return ConvertVectorExpr::Create(Context, E, TInfo, DstTy, VK, OK, BuiltinLoc, + RParenLoc, CurFPFeatureOverrides()); } bool Sema::BuiltinPrefetch(CallExpr *TheCall) { diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index fba54023a6bb2..835ad4a658944 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -1387,10 +1387,15 @@ void ASTStmtReader::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { void ASTStmtReader::VisitConvertVectorExpr(ConvertVectorExpr *E) { VisitExpr(E); + bool HasFPFeatures = CurrentUnpackingBits->getNextBit(); + assert(HasFPFeatures == E->hasStoredFPFeatures()); E->BuiltinLoc = readSourceLocation(); E->RParenLoc = readSourceLocation(); E->TInfo = readTypeSourceInfo(); E->SrcExpr = Record.readSubExpr(); + if (HasFPFeatures) + E->setStoredFPFeatures( + FPOptionsOverride::getFromOpaqueInt(Record.readInt())); } void ASTStmtReader::VisitBlockExpr(BlockExpr *E) { @@ -3385,9 +3390,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = new (Context) ShuffleVectorExpr(Empty); break; - case EXPR_CONVERT_VECTOR: - S = new (Context) ConvertVectorExpr(Empty); + case EXPR_CONVERT_VECTOR: { + BitsUnpacker ConvertVectorExprBits(Record[ASTStmtReader::NumStmtFields]); + ConvertVectorExprBits.advance(ASTStmtReader::NumExprBits); + bool HasFPFeatures = ConvertVectorExprBits.getNextBit(); + S = ConvertVectorExpr::CreateEmpty(Context, HasFPFeatures); break; + } case EXPR_BLOCK: S = new (Context) BlockExpr(Empty); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 2687231d7820f..82738d3a8c88a 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -1335,11 +1335,15 @@ void ASTStmtWriter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { void ASTStmtWriter::VisitConvertVectorExpr(ConvertVectorExpr *E) { VisitExpr(E); + bool HasFPFeatures = E->hasStoredFPFeatures(); + CurrentPackingBits.addBit(HasFPFeatures); Record.AddSourceLocation(E->getBuiltinLoc()); Record.AddSourceLocation(E->getRParenLoc()); Record.AddTypeSourceInfo(E->getTypeSourceInfo()); Record.AddStmt(E->getSrcExpr()); Code = serialization::EXPR_CONVERT_VECTOR; + if (HasFPFeatures) + 
Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt()); } void ASTStmtWriter::VisitBlockExpr(BlockExpr *E) { diff --git a/clang/test/AST/ast-dump-fpfeatures.cpp b/clang/test/AST/ast-dump-fpfeatures.cpp index cd00650db55cc..eeead3462c0ec 100644 --- a/clang/test/AST/ast-dump-fpfeatures.cpp +++ b/clang/test/AST/ast-dump-fpfeatures.cpp @@ -8,6 +8,17 @@ // RUN: | sed -e "s/ //" -e "s/ imported//" \ // RUN: | FileCheck --strict-whitespace %s +// CHECK-LABEL: FunctionDecl {{.*}} no_fpfeatures_func_01 'vector2float (vector2double)' +// CHECK: CompoundStmt {{.*\>$}} +// CHECK: ReturnStmt +// CHECK: ConvertVectorExpr {{.*}} 'vector2float':'__attribute__((__vector_size__(2 * sizeof(float)))) float'{{$}} + +typedef double vector2double __attribute__((__vector_size__(16))); +typedef float vector2float __attribute__((__vector_size__(8))); +vector2float no_fpfeatures_func_01(vector2double x) { + return __builtin_convertvector(x, vector2float); +} + float func_01(float x); template @@ -248,4 +259,14 @@ __attribute__((optnone)) T func_22(T x, T y) { float func_23(float x, float y) { return func_22(x, y); -} \ No newline at end of file +} + +// CHECK-LABEL: FunctionDecl {{.*}} func_24 'vector2float (vector2double)' +// CHECK: CompoundStmt {{.*}} FPContractMode=2 ConstRoundingMode=towardzero +// CHECK: ReturnStmt +// CHECK: ConvertVectorExpr {{.*}} FPContractMode=2 ConstRoundingMode=towardzero + +#pragma STDC FENV_ROUND FE_TOWARDZERO +vector2float func_24(vector2double x) { + return __builtin_convertvector(x, vector2float); +} diff --git a/clang/test/AST/const-fpfeatures.c b/clang/test/AST/const-fpfeatures.c index 8dc3221b0638a..787bb989dd4a2 100644 --- a/clang/test/AST/const-fpfeatures.c +++ b/clang/test/AST/const-fpfeatures.c @@ -22,6 +22,12 @@ float _Complex C1u = C0; float FLu = 0.1F; // CHECK: @FLu = {{.*}} float 0x3FB99999A0000000 +typedef float vector2float __attribute__((__vector_size__(8))); +typedef double vector2double __attribute__((__vector_size__(16))); +const vector2float V2Fu = {1.0F + 0x0.000001p0F, 1.0F + 0x0.000002p0F}; +vector2double V2Du = __builtin_convertvector(V2Fu, vector2double); +// CHECK: @V2Fu = {{.*}} <2 x float> splat (float 0x3FF0000020000000) +// CHECK: @V2Du = {{.*}} <2 x double> splat (double 0x3FF0000020000000) #pragma STDC FENV_ROUND FE_DOWNWARD @@ -41,3 +47,8 @@ float _Complex C1d = C0; float FLd = 0.1F; // CHECK: @FLd = {{.*}} float 0x3FB9999980000000 + +const vector2float V2Fd = {1.0F + 0x0.000001p0F, 1.0F + 0x0.000002p0F}; +vector2double V2Dd = __builtin_convertvector(V2Fd, vector2double); +// CHECK: @V2Fd = {{.*}} <2 x float> +// CHECK: @V2Dd = {{.*}} <2 x double> diff --git a/clang/test/CodeGen/pragma-fenv_access.c b/clang/test/CodeGen/pragma-fenv_access.c index afca115ed08d1..347e9670c4742 100644 --- a/clang/test/CodeGen/pragma-fenv_access.c +++ b/clang/test/CodeGen/pragma-fenv_access.c @@ -242,3 +242,12 @@ float func_20(float x, float y) { // CHECK-LABEL: @func_20 // STRICT: call float @llvm.experimental.constrained.fadd.f32(float {{.*}}, float {{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // DEFAULT: fadd float + +typedef double vector4double __attribute__((__vector_size__(32))); +typedef float vector4float __attribute__((__vector_size__(16))); +vector4float func_21(vector4double x) { + #pragma STDC FENV_ROUND FE_UPWARD + return __builtin_convertvector(x, vector4float); +} +// CHECK-LABEL: @func_21 +// STRICT: call <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64(<4 x double> {{.*}}, metadata !"round.upward", metadata 
!"fpexcept.strict") From ddf24086f119cacf2a0fc489773f8af302f4a489 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 19 Feb 2025 18:05:22 +0100 Subject: [PATCH 084/220] [AMDGPU] Remove unused variables. NFC --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2691a4135b6f2..2cf6de73fa90c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3607,13 +3607,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src0 = &UseMI.getOperand(Src0Idx); - bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; - bool IsFMA = - Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -4002,17 +3995,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, "present pre-RA"); // Handle MAC/FMAC. - bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64; - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || - Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || - Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || From 75ea7aed93ec8afa43634a41c2e94380ba0d671e Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Wed, 19 Feb 2025 09:08:37 -0800 Subject: [PATCH 085/220] [CIR] Add additional frontend actions (#127249) Add frontend actions to support emitting assembly, bitcode, and object files when compiling with ClangIR. This change also correctly sets and propagates the target triple in the MLIR and LLVM modules, which was a necessary prerequisite for emitting assembly and object files. 
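As a usage sketch (not part of this change; it mirrors the RUN lines in the new clang/test/CIR/emit-actions.cpp test, the triple is just the one used there, and file.cpp is a placeholder input), the new actions are reached through the usual -cc1 output options once -fclangir is given:

  clang -cc1 -triple x86_64-unknown-linux-gnu -fclangir -S file.cpp -o file.s
  clang -cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm-bc file.cpp -o file.bc
  clang -cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-obj file.cpp -o file.o

Each of these goes through CIR, lowers to LLVM IR, and then reuses the existing backend actions (Backend_EmitAssembly, Backend_EmitBC, Backend_EmitObj).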
--- .../clang/CIR/Dialect/IR/CIRDialect.td | 2 ++ .../clang/CIR/FrontendAction/CIRGenAction.h | 24 +++++++++++++++ clang/lib/CIR/CodeGen/CIRGenModule.cpp | 3 ++ clang/lib/CIR/CodeGen/CIRGenModule.h | 4 +++ clang/lib/CIR/FrontendAction/CIRGenAction.cpp | 29 ++++++++++++++++++- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 11 +++++++ .../ExecuteCompilerInvocation.cpp | 21 ++++++++++++-- clang/test/CIR/emit-actions.cpp | 21 ++++++++++++++ 8 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 clang/test/CIR/emit-actions.cpp diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td index 305a06427ed0e..73759cfa9c3c9 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td @@ -28,6 +28,8 @@ def CIR_Dialect : Dialect { let useDefaultTypePrinterParser = 0; let extraClassDeclaration = [{ + static llvm::StringRef getTripleAttrName() { return "cir.triple"; } + void registerAttributes(); void registerTypes(); diff --git a/clang/include/clang/CIR/FrontendAction/CIRGenAction.h b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h index 5f9110bc83b89..99495f4718c5f 100644 --- a/clang/include/clang/CIR/FrontendAction/CIRGenAction.h +++ b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h @@ -25,8 +25,11 @@ class CIRGenConsumer; class CIRGenAction : public clang::ASTFrontendAction { public: enum class OutputType { + EmitAssembly, EmitCIR, EmitLLVM, + EmitBC, + EmitObj, }; private: @@ -63,6 +66,27 @@ class EmitLLVMAction : public CIRGenAction { EmitLLVMAction(mlir::MLIRContext *MLIRCtx = nullptr); }; +class EmitBCAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitBCAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + +class EmitAssemblyAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitAssemblyAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + +class EmitObjAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitObjAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + } // namespace cir #endif diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 2615ae382cb8b..cbecdf925aa5d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -52,6 +52,9 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, DoubleTy = cir::DoubleType::get(&getMLIRContext()); FP80Ty = cir::FP80Type::get(&getMLIRContext()); FP128Ty = cir::FP128Type::get(&getMLIRContext()); + + theModule->setAttr(cir::CIRDialect::getTripleAttrName(), + builder.getStringAttr(getTriple().str())); } mlir::Location CIRGenModule::getLoc(SourceLocation cLoc) { diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 1c7ed63773900..29bb4036218e4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -21,7 +21,9 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" #include "llvm/ADT/StringRef.h" +#include "llvm/TargetParser/Triple.h" namespace clang { class ASTContext; @@ -88,6 +90,8 @@ class CIRGenModule : public CIRGenTypeCache { void emitGlobalVarDefinition(const clang::VarDecl *vd, bool isTentative = false); + const llvm::Triple &getTriple() const { return target.getTriple(); } + /// Helpers to emit "not yet implemented" error diagnostics DiagnosticBuilder errorNYI(SourceLocation, llvm::StringRef); diff 
--git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp index eab6958ac8f6d..0f686a36b982b 100644 --- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp +++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp @@ -27,8 +27,14 @@ getBackendActionFromOutputType(CIRGenAction::OutputType Action) { assert(false && "Unsupported output type for getBackendActionFromOutputType!"); break; // Unreachable, but fall through to report that + case CIRGenAction::OutputType::EmitAssembly: + return BackendAction::Backend_EmitAssembly; + case CIRGenAction::OutputType::EmitBC: + return BackendAction::Backend_EmitBC; case CIRGenAction::OutputType::EmitLLVM: return BackendAction::Backend_EmitLL; + case CIRGenAction::OutputType::EmitObj: + return BackendAction::Backend_EmitObj; } // We should only get here if a non-enum value is passed in or we went through // the assert(false) case above @@ -84,7 +90,10 @@ class CIRGenConsumer : public clang::ASTConsumer { MlirModule->print(*OutputStream, Flags); } break; - case CIRGenAction::OutputType::EmitLLVM: { + case CIRGenAction::OutputType::EmitLLVM: + case CIRGenAction::OutputType::EmitBC: + case CIRGenAction::OutputType::EmitObj: + case CIRGenAction::OutputType::EmitAssembly: { llvm::LLVMContext LLVMCtx; std::unique_ptr LLVMModule = lowerFromCIRToLLVMIR(MlirModule, LLVMCtx); @@ -111,10 +120,16 @@ static std::unique_ptr getOutputStream(CompilerInstance &CI, StringRef InFile, CIRGenAction::OutputType Action) { switch (Action) { + case CIRGenAction::OutputType::EmitAssembly: + return CI.createDefaultOutputFile(false, InFile, "s"); case CIRGenAction::OutputType::EmitCIR: return CI.createDefaultOutputFile(false, InFile, "cir"); case CIRGenAction::OutputType::EmitLLVM: return CI.createDefaultOutputFile(false, InFile, "ll"); + case CIRGenAction::OutputType::EmitBC: + return CI.createDefaultOutputFile(true, InFile, "bc"); + case CIRGenAction::OutputType::EmitObj: + return CI.createDefaultOutputFile(true, InFile, "o"); } llvm_unreachable("Invalid CIRGenAction::OutputType"); } @@ -132,6 +147,10 @@ CIRGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { return Result; } +void EmitAssemblyAction::anchor() {} +EmitAssemblyAction::EmitAssemblyAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitAssembly, MLIRCtx) {} + void EmitCIRAction::anchor() {} EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) : CIRGenAction(OutputType::EmitCIR, MLIRCtx) {} @@ -139,3 +158,11 @@ EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) void EmitLLVMAction::anchor() {} EmitLLVMAction::EmitLLVMAction(mlir::MLIRContext *MLIRCtx) : CIRGenAction(OutputType::EmitLLVM, MLIRCtx) {} + +void EmitBCAction::anchor() {} +EmitBCAction::EmitBCAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitBC, MLIRCtx) {} + +void EmitObjAction::anchor() {} +EmitObjAction::EmitObjAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitObj, MLIRCtx) {} diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 74ff89346f3c4..235b5a057852a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -114,6 +114,8 @@ struct ConvertCIRToLLVMPass } void runOnOperation() final; + void processCIRAttrs(mlir::ModuleOp module); + StringRef getDescription() const override { return "Convert the prepared CIR dialect module to LLVM dialect"; } @@ -271,6 +273,13 @@ static void 
prepareTypeConverter(mlir::LLVMTypeConverter &converter, }); } +void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) { + // Lower the module attributes to LLVM equivalents. + if (auto tripleAttr = module->getAttr(cir::CIRDialect::getTripleAttrName())) + module->setAttr(mlir::LLVM::LLVMDialect::getTargetTripleAttrName(), + tripleAttr); +} + void ConvertCIRToLLVMPass::runOnOperation() { llvm::TimeTraceScope scope("Convert CIR to LLVM Pass"); @@ -283,6 +292,8 @@ void ConvertCIRToLLVMPass::runOnOperation() { patterns.add(converter, patterns.getContext(), dl); + processCIRAttrs(module); + mlir::ConversionTarget target(getContext()); target.addLegalOp(); target.addLegalDialect(); diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index c8d004163b96d..bb3bb0aac78bf 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -62,8 +62,18 @@ CreateFrontendBaseAction(CompilerInstance &CI) { return std::make_unique(); case DumpRawTokens: return std::make_unique(); case DumpTokens: return std::make_unique(); - case EmitAssembly: return std::make_unique(); - case EmitBC: return std::make_unique(); + case EmitAssembly: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique(); +#endif + return std::make_unique(); + case EmitBC: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique(); +#endif + return std::make_unique(); case EmitCIR: #if CLANG_ENABLE_CIR return std::make_unique(); @@ -80,7 +90,12 @@ CreateFrontendBaseAction(CompilerInstance &CI) { } case EmitLLVMOnly: return std::make_unique(); case EmitCodeGenOnly: return std::make_unique(); - case EmitObj: return std::make_unique(); + case EmitObj: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique(); +#endif + return std::make_unique(); case ExtractAPI: return std::make_unique(); case FixIt: return std::make_unique(); diff --git a/clang/test/CIR/emit-actions.cpp b/clang/test/CIR/emit-actions.cpp new file mode 100644 index 0000000000000..94ddf23b34753 --- /dev/null +++ b/clang/test/CIR/emit-actions.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -S %s -o - | FileCheck %s -check-prefix=ASM + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm-bc %s -o %t.bc +// RUN: llvm-dis %t.bc -o - | FileCheck %s -check-prefix=BC + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-obj %s -o %t.o +// RUN: llvm-objdump -t %t.o | FileCheck %s -check-prefix=OBJ + +// TODO: Make this test target-independent +// REQUIRES: x86-registered-target + +int x = 1; + +// BC: @x = dso_local global i32 1 + +// ASM: x: +// ASM: .long 1 +// ASM: .size x, 4 + +// OBJ: .data +// OBJ-SAME: x From 8fc03e4ff1b33816364dda6986032cbbe99a9462 Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Wed, 19 Feb 2025 18:09:24 +0100 Subject: [PATCH 086/220] [InstCombine] avoid extra instructions in foldSelectICmpAnd (#127398) Disable fold when it will result in more instructions. 
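As a rough illustration (schematic IR in the spirit of the updated select-icmp-and.ll expectations below; value names are illustrative), a pattern such as

  %t1 = and i8 %X, 128
  %t2 = icmp ne i8 %t1, 0
  %t3 = select i1 %t2, i32 256, i32 0

was previously folded into an 'and' + 'zext' + 'shl' sequence. With this change the 'icmp' + 'select' form is kept when the replacement ('and', shift, 'xor', and/or a zext/trunc cast) would take more instructions than the fold removes.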
--- .../InstCombine/InstCombineSelect.cpp | 14 +++++++---- .../Transforms/InstCombine/select-icmp-and.ll | 23 ++++++++----------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index cf38fc5f058f2..1b80e3555fbea 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -205,11 +205,15 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp, unsigned ValZeros = ValC.logBase2(); unsigned AndZeros = AndMask.logBase2(); bool ShouldNotVal = !TC.isZero(); - - // If we would need to create an 'and' + 'shift' + 'xor' to replace a 'select' - // + 'icmp', then this transformation would result in more instructions and - // potentially interfere with other folding. - if (CreateAnd && ShouldNotVal && ValZeros != AndZeros) + bool NeedShift = ValZeros != AndZeros; + bool NeedZExtTrunc = + SelType->getScalarSizeInBits() != V->getType()->getScalarSizeInBits(); + + // If we would need to create an 'and' + 'shift' + 'xor' + cast to replace + // a 'select' + 'icmp', then this transformation would result in more + // instructions and potentially interfere with other folding. + if (CreateAnd + ShouldNotVal + NeedShift + NeedZExtTrunc > + 1 + Cmp->hasOneUse()) return nullptr; // Insert the 'and' instruction on the input to the truncate. diff --git a/llvm/test/Transforms/InstCombine/select-icmp-and.ll b/llvm/test/Transforms/InstCombine/select-icmp-and.ll index 1218799ab3dc5..e49c2f6214114 100644 --- a/llvm/test/Transforms/InstCombine/select-icmp-and.ll +++ b/llvm/test/Transforms/InstCombine/select-icmp-and.ll @@ -391,9 +391,8 @@ define i32 @test15e_extra_use(i32 %X) { ;; (a & 128) ? 256 : 0 define i32 @test15e_zext(i8 %X) { ; CHECK-LABEL: @test15e_zext( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[T2_NOT:%.*]] = icmp sgt i8 [[X:%.*]], -1 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2_NOT]], i32 0, i32 256 ; CHECK-NEXT: ret i32 [[T3]] ; %t1 = and i8 %X, 128 @@ -406,9 +405,7 @@ define i32 @test15e_zext(i8 %X) { define i32 @test15e_zext_extra_use(i8 %X) { ; CHECK-LABEL: @test15e_zext_extra_use( ; CHECK-NEXT: [[T2:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 256, i32 0 ; CHECK-NEXT: call void @use1(i1 [[T2]]) ; CHECK-NEXT: ret i32 [[T3]] ; @@ -438,8 +435,7 @@ define i32 @test15f_extra_use(i32 %X) { ; CHECK-LABEL: @test15f_extra_use( ; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 128 ; CHECK-NEXT: [[T2:%.*]] = icmp ne i32 [[T1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[T1]], 1 -; CHECK-NEXT: [[T3:%.*]] = xor i32 [[TMP1]], 256 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 0, i32 256 ; CHECK-NEXT: call void @use1(i1 [[T2]]) ; CHECK-NEXT: ret i32 [[T3]] ; @@ -453,10 +449,9 @@ define i32 @test15f_extra_use(i32 %X) { ;; (a & 128) ? 
0 : 256 define i16 @test15f_trunc(i32 %X) { ; CHECK-LABEL: @test15f_trunc( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = shl i16 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = and i16 [[TMP2]], 256 -; CHECK-NEXT: [[T3:%.*]] = xor i16 [[TMP3]], 256 +; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 128 +; CHECK-NEXT: [[T2_NOT:%.*]] = icmp eq i32 [[T1]], 0 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2_NOT]], i16 256, i16 0 ; CHECK-NEXT: ret i16 [[T3]] ; %t1 = and i32 %X, 128 @@ -799,7 +794,9 @@ define i8 @select_bittest_to_xor(i8 %x) { ; CHECK-LABEL: @select_bittest_to_xor( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use1(i1 [[CMP]]) -; CHECK-NEXT: [[MASKSEL:%.*]] = xor i8 [[X]], -128 +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 127 +; CHECK-NEXT: [[MASKSEL1:%.*]] = select i1 [[CMP]], i8 -128, i8 0 +; CHECK-NEXT: [[MASKSEL:%.*]] = or disjoint i8 [[AND]], [[MASKSEL1]] ; CHECK-NEXT: ret i8 [[MASKSEL]] ; %cmp = icmp sgt i8 %x, -1 From 0ffe270d0e1dadc3e16f8fa79882fc3b31d28559 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 19 Feb 2025 09:15:26 -0800 Subject: [PATCH 087/220] [ELF,test] Remove unneeded -o /dev/null When the script has executed `cd %t`, it is fine to to use the output file `a.out`. (We don't want to rely on lit's default PWD to support lit compatible runners. Therefore -o /dev/null is used when PWD has not been changed to a %t derived path.) --- lld/test/ELF/aarch64-feature-gcs.s | 18 +++++----- lld/test/ELF/allow-shlib-undefined-weak.s | 2 +- lld/test/ELF/allow-shlib-undefined.s | 36 +++++++++---------- lld/test/ELF/arm-cmse-diagnostics.s | 18 +++++----- lld/test/ELF/avr-reloc-error.s | 6 ++-- lld/test/ELF/common-archive-lookup.s | 2 +- lld/test/ELF/duplicated-synthetic-sym.s | 12 +++---- lld/test/ELF/linkerscript/discard-section.s | 4 +-- lld/test/ELF/linkerscript/input-relative.s | 4 +-- .../locationcountererr-arm-exidx.test | 2 +- lld/test/ELF/lto/archive-mixed.test | 16 ++++----- lld/test/ELF/lto/obj-path.ll | 4 +-- lld/test/ELF/lto/parallel.ll | 2 +- lld/test/ELF/non-abs-reloc.s | 4 +-- lld/test/ELF/print-archive-stats.s | 8 ++--- lld/test/ELF/remap-inputs.test | 12 +++---- lld/test/ELF/reproduce-deplibs.s | 2 +- lld/test/ELF/reproduce-lto.s | 6 ++-- lld/test/ELF/riscv-attributes.s | 8 ++--- lld/test/ELF/unknown-section.test | 2 +- lld/test/ELF/why-extract.s | 24 ++++++------- 21 files changed, 95 insertions(+), 97 deletions(-) diff --git a/lld/test/ELF/aarch64-feature-gcs.s b/lld/test/ELF/aarch64-feature-gcs.s index 7a08673dbb7e6..b53a653dddaee 100644 --- a/lld/test/ELF/aarch64-feature-gcs.s +++ b/lld/test/ELF/aarch64-feature-gcs.s @@ -36,15 +36,15 @@ ## gcs-report should report any input files that don't have the gcs property. 
-# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | count 0 -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | count 0 -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning -z gcs=always 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning -z gcs=never 2>&1 | count 0 # REPORT-WARN: warning: func2.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property # REPORT-ERROR: error: func3.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property diff --git a/lld/test/ELF/allow-shlib-undefined-weak.s b/lld/test/ELF/allow-shlib-undefined-weak.s index 1037cbed0d859..141881fd73673 100644 --- a/lld/test/ELF/allow-shlib-undefined-weak.s +++ b/lld/test/ELF/allow-shlib-undefined-weak.s @@ -21,7 +21,7 @@ # RUN: ld.lld -shared wrap.o def.so -o wrap.so # RUN: llvm-mc -filetype=obj -triple=x86_64 start.s -o start.o -# RUN: ld.lld --no-allow-shlib-undefined start.o wrap.so ref.so -o /dev/null 2>&1 | count 0 +# RUN: ld.lld --no-allow-shlib-undefined start.o wrap.so ref.so 2>&1 | count 0 #--- start.s .globl _start diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s index c69c1ea20ce3b..a088c2595d538 100644 --- a/lld/test/ELF/allow-shlib-undefined.s +++ b/lld/test/ELF/allow-shlib-undefined.s @@ -9,40 +9,40 @@ # RUN: cp a.so b.so # RUN: llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o && ld.lld -shared empty.o -o empty.so -# RUN: ld.lld --allow-shlib-undefined main.o a.so -o /dev/null -# RUN: not ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --allow-shlib-undefined main.o a.so +# RUN: 
not ld.lld --no-allow-shlib-undefined main.o a.so 2>&1 | FileCheck %s ## Executable linking defaults to --no-allow-shlib-undefined. -# RUN: not ld.lld main.o a.so -o /dev/null 2>&1 | FileCheck %s -# RUN: ld.lld main.o a.so --noinhibit-exec -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN -# RUN: ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN +# RUN: not ld.lld main.o a.so 2>&1 | FileCheck %s +# RUN: ld.lld main.o a.so --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=WARN +# RUN: ld.lld main.o a.so --warn-unresolved-symbols 2>&1 | FileCheck %s --check-prefix=WARN ## -shared linking defaults to --allow-shlib-undefined. -# RUN: ld.lld -shared main.o a.so -o /dev/null +# RUN: ld.lld -shared main.o a.so ## DSO with undefines should link with or without any of these options. -# RUN: ld.lld -shared --allow-shlib-undefined a.o -o /dev/null -# RUN: ld.lld -shared --no-allow-shlib-undefined a.o -o /dev/null +# RUN: ld.lld -shared --allow-shlib-undefined a.o +# RUN: ld.lld -shared --no-allow-shlib-undefined a.o ## Perform checking even if an unresolved symbol is first seen in a regular object file. -# RUN: not ld.lld --gc-sections main.o ref.o a.so -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld --gc-sections main.o ref.o a.so 2>&1 | FileCheck %s ## Check that the error is reported for each shared library where the symbol ## is referenced. -# RUN: not ld.lld main.o a.so empty.so b.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK2 +# RUN: not ld.lld main.o a.so empty.so b.so 2>&1 | FileCheck %s --check-prefix=CHECK2 ## Test some cases when a relocatable object file provides a non-exported definition. -# RUN: not ld.lld main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o def-hidden.o a.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings -o /dev/null +# RUN: not ld.lld main.o a.so def-hidden.o 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings ## Test a relocatable object file definition that is converted to STB_LOCAL. -# RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o def-hidden.o a.so --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so --version-script=local.ver 2>&1 | FileCheck %s --check-prefix=NONEXPORTED ## The section containing the definition is discarded, and we report an error. -# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o 2>&1 | FileCheck %s ## The definition def.so is ignored. 
# RUN: ld.lld -shared def.o -o def.so -# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings -o /dev/null +# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings # CHECK-NOT: error: # CHECK: error: undefined reference: x1{{$}} diff --git a/lld/test/ELF/arm-cmse-diagnostics.s b/lld/test/ELF/arm-cmse-diagnostics.s index d30f2431cc57a..4c8a4097e8250 100644 --- a/lld/test/ELF/arm-cmse-diagnostics.s +++ b/lld/test/ELF/arm-cmse-diagnostics.s @@ -7,11 +7,11 @@ // RUN: llvm-mc -arm-add-build-attributes -filetype=obj --triple=thumbv8m.base lib -o lib.o // RUN: llvm-mc -arm-add-build-attributes -filetype=obj --triple=thumbv8m.base app -I %S/Inputs -o app.o // RUN: llvm-objcopy --redefine-sym=entry7_duplicate=entry6_duplicate lib.o -// RUN: not ld.lld --cmse-implib --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IMPLIB -// RUN: not ld.lld --cmse-implib --in-implib=lib.o --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_MULT_INIMPLIB -// RUN: not ld.lld --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB -// RUN: not ld.lld --out-implib=out.lib app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_OUT_IMPLIB -// RUN: not ld.lld --out-implib=out.lib --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB,ERR_OUT_IMPLIB +// RUN: not ld.lld --cmse-implib --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IMPLIB +// RUN: not ld.lld --cmse-implib --in-implib=lib.o --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_MULT_INIMPLIB +// RUN: not ld.lld --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB +// RUN: not ld.lld --out-implib=out.lib app.o 2>&1 | FileCheck %s --check-prefixes=ERR_OUT_IMPLIB +// RUN: not ld.lld --out-implib=out.lib --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB,ERR_OUT_IMPLIB // ERR_IMPLIB: error: CMSE symbol 'entry_not_external' in import library '{{.*}}' is not global // ERR_IMPLIB: error: CMSE symbol 'entry_not_absolute' in import library '{{.*}}' is not absolute @@ -91,7 +91,7 @@ /// Test diagnostics emitted during symbol attribute checks. // RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base symattr -o symattr.o -// RUN: not ld.lld --cmse-implib symattr.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_SYMATTR +// RUN: not ld.lld --cmse-implib symattr.o 2>&1 | FileCheck %s --check-prefixes=ERR_SYMATTR // ERR_SYMATTR-NOT: __acle_se_valid_{{.*}} // ERR_SYMATTR: error: {{.*}}: cmse special symbol '__acle_se_invalid_1' is not a Thumb function definition @@ -110,9 +110,9 @@ /// Test diagnostics emitted when a symbol is removed from a later version of the import library. 
// RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base libv1 -o libv1.o // RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base libv2 -o libv2.o -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --out-implib=libv1.lib -o /dev/null -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv2.o --in-implib=libv1.lib --out-implib=libv2.lib -o /dev/null 2>&1 | FileCheck %s --check-prefixes=WARN_MISSING -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --in-implib=libv2.lib -o /dev/null 2>&1 | FileCheck %s --check-prefixes=WARN_NEWENTRY +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --out-implib=libv1.lib +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv2.o --in-implib=libv1.lib --out-implib=libv2.lib 2>&1 | FileCheck %s --check-prefixes=WARN_MISSING +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --in-implib=libv2.lib 2>&1 | FileCheck %s --check-prefixes=WARN_NEWENTRY // WARN_MISSING: warning: entry function 'bar' from CMSE import library is not present in secure application // WARN_NEWENTRY: warning: new entry function 'bar' introduced but no output import library specified diff --git a/lld/test/ELF/avr-reloc-error.s b/lld/test/ELF/avr-reloc-error.s index f177e44f753fa..b36a24d764c5c 100644 --- a/lld/test/ELF/avr-reloc-error.s +++ b/lld/test/ELF/avr-reloc-error.s @@ -3,13 +3,13 @@ # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-7.s -o avr-pcrel-7.o -# RUN: not ld.lld avr-pcrel-7.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \ +# RUN: not ld.lld avr-pcrel-7.o -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \ # RUN: FileCheck %s --check-prefix=PCREL7 # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-13.s -o avr-pcrel-13.o -# RUN: not ld.lld avr-pcrel-13.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \ +# RUN: not ld.lld avr-pcrel-13.o -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \ # RUN: FileCheck %s --check-prefix=PCREL13 # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-abs.s -o avr-abs.o -# RUN: not ld.lld avr-abs.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1009 --defsym=callee1=0x1010 2>&1 | \ +# RUN: not ld.lld avr-abs.o -Ttext=0x1000 --defsym=callee0=0x1009 --defsym=callee1=0x1010 2>&1 | \ # RUN: FileCheck %s --check-prefix=ABS #--- avr-pcrel-7.s diff --git a/lld/test/ELF/common-archive-lookup.s b/lld/test/ELF/common-archive-lookup.s index a30d0f18d01ad..9834d13ed7c24 100644 --- a/lld/test/ELF/common-archive-lookup.s +++ b/lld/test/ELF/common-archive-lookup.s @@ -69,7 +69,7 @@ # RUN: FileCheck --check-prefix=ASM %s < out.lto.s ## COMMON overrides weak. Don't extract 3.bc which provides a weak definition. 
-# RUN: ld.lld -o /dev/null main.o --start-lib 1.bc 3.bc --end-lib -y block | FileCheck --check-prefix=LTO_WEAK %s +# RUN: ld.lld main.o --start-lib 1.bc 3.bc --end-lib -y block | FileCheck --check-prefix=LTO_WEAK %s ## Old FORTRAN that mixes use of COMMON blocks and BLOCK DATA requires that we ## search through archives for non-tentative definitions (from the BLOCK DATA) diff --git a/lld/test/ELF/duplicated-synthetic-sym.s b/lld/test/ELF/duplicated-synthetic-sym.s index d08af3a1a52e5..9d47ec10f797f 100644 --- a/lld/test/ELF/duplicated-synthetic-sym.s +++ b/lld/test/ELF/duplicated-synthetic-sym.s @@ -1,14 +1,12 @@ // REQUIRES: x86 -// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: rm -rf %t.dir -// RUN: mkdir %t.dir -// RUN: cd %t.dir +// RUN: rm -rf %t && mkdir %t && cd %t +// RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o // RUN: echo > file.bin -// RUN: not ld.lld %t.o --format=binary file.bin -o /dev/null 2>&1 | FileCheck %s -// RUN: not ld.lld %t.o --format binary file.bin -o /dev/null 2>&1 | FileCheck %s +// RUN: not ld.lld a.o --format=binary file.bin 2>&1 | FileCheck %s +// RUN: not ld.lld a.o --format binary file.bin 2>&1 | FileCheck %s -// CHECK: duplicate symbol: _binary_file_bin_start +// CHECK: error: duplicate symbol: _binary_file_bin_start // CHECK-NEXT: defined in {{.*}}.o // CHECK-NEXT: defined in file.bin diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s index 0bbebac59bb34..d6dd8a5347e94 100644 --- a/lld/test/ELF/linkerscript/discard-section.s +++ b/lld/test/ELF/linkerscript/discard-section.s @@ -4,8 +4,8 @@ # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o -# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -z undefs -o /dev/null 2>&1 | FileCheck %s --check-prefix=LOCAL --implicit-check-not=error: -# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=LOCAL,NONLOCAL --implicit-check-not=error: +# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -z undefs 2>&1 | FileCheck %s --check-prefix=LOCAL --implicit-check-not=error: +# RUN: not ld.lld --threads=1 -T a.lds a.o b.o 2>&1 | FileCheck %s --check-prefixes=LOCAL,NONLOCAL --implicit-check-not=error: # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning: # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC diff --git a/lld/test/ELF/linkerscript/input-relative.s b/lld/test/ELF/linkerscript/input-relative.s index 771684c7c4f82..3f81c5f3ee9e3 100644 --- a/lld/test/ELF/linkerscript/input-relative.s +++ b/lld/test/ELF/linkerscript/input-relative.s @@ -31,13 +31,13 @@ ## The rules does not apply to an absolute path. # RUN: echo 'INPUT(/libb.a)' > dir/absolute.lds -# RUN: not ld.lld a.o dir/absolute.lds -o /dev/null +# RUN: not ld.lld a.o dir/absolute.lds ## If the parent directory of the current linker script does not contain the file, ## fall back to the current working directory. 
# RUN: cp libb.a libc.a # RUN: echo 'INPUT(libc.a)' > dir/fallback.lds -# RUN: ld.lld a.o dir/fallback.lds -o /dev/null +# RUN: ld.lld a.o dir/fallback.lds .globl _start _start: diff --git a/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test b/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test index c82a93efc1aae..7a18015cfcab4 100644 --- a/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test +++ b/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test @@ -5,7 +5,7 @@ ## If we don't merge adjacent duplicate entries, __code_size will be negative and ## . += __code_size will trigger a "move location counter backward" error. ## LLD may report more errors further down, but there is only one "move location counter backward" error. -# RUN: not ld.lld -z norelro -z max-page-size=4096 -T a.t a.o -o /dev/null --no-merge-exidx-entries 2>&1 | \ +# RUN: not ld.lld -z norelro -z max-page-size=4096 -T a.t a.o --no-merge-exidx-entries 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR --implicit-check-not=error: # ERR: error: a.t:9: unable to move location counter (0x1000) backward to 0xf6c for section 'dummy1' diff --git a/lld/test/ELF/lto/archive-mixed.test b/lld/test/ELF/lto/archive-mixed.test index fbb84a1d8bb76..6f1db87c89ca1 100644 --- a/lld/test/ELF/lto/archive-mixed.test +++ b/lld/test/ELF/lto/archive-mixed.test @@ -19,22 +19,22 @@ ; RUN: llvm-ar rc other.bc.a a.bc ; RUN: llvm-ar rc other.o.a a.o -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.bc.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.bc.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.bc.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.bc.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.o.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.o.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.o.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.o.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.bc.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.o.b.bc.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.bc.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.o.b.bc.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.o.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.o.b.o.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} ;; Not an LTO test case, but here for completeness. -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.o.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.o.b.o.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} ; CHECK: ref.o diff --git a/lld/test/ELF/lto/obj-path.ll b/lld/test/ELF/lto/obj-path.ll index c0bb4addf2466..bf291ff8a0458 100644 --- a/lld/test/ELF/lto/obj-path.ll +++ b/lld/test/ELF/lto/obj-path.ll @@ -54,14 +54,14 @@ ;; With --thinlto-index-only, --lto-obj-path= creates just one file. ; RUN: rm -f objpath.o objpath.o1 objpath.o2 -; RUN: ld.lld --thinlto-index-only --lto-obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: ld.lld --thinlto-index-only --lto-obj-path=objpath.o -shared 1.bc d/2.bc ; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY ; RUN: not ls objpath.o1 ; RUN: not ls objpath.o2 ;; Test --plugin-opt=obj-path=. 
; RUN: rm -f objpath.o -; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc ; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY ;; Ensure lld emits empty combined module if specific obj-path. diff --git a/lld/test/ELF/lto/parallel.ll b/lld/test/ELF/lto/parallel.ll index 6b2c352b0a965..e32225c3ed3b8 100644 --- a/lld/test/ELF/lto/parallel.ll +++ b/lld/test/ELF/lto/parallel.ll @@ -5,7 +5,7 @@ ; RUN: llvm-nm out.lto.o | FileCheck --check-prefix=CHECK0 %s ; RUN: llvm-nm out.lto.1.o | FileCheck --check-prefix=CHECK1 %s -; RUN: not ld.lld --lto-partitions=0 a.bc -o /dev/null 2>&1 | FileCheck --check-prefix=INVALID %s +; RUN: not ld.lld --lto-partitions=0 a.bc 2>&1 | FileCheck --check-prefix=INVALID %s ; INVALID: --lto-partitions: number of threads must be > 0 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/lld/test/ELF/non-abs-reloc.s b/lld/test/ELF/non-abs-reloc.s index 42b5f8fec1c43..e37a0ec12414b 100644 --- a/lld/test/ELF/non-abs-reloc.s +++ b/lld/test/ELF/non-abs-reloc.s @@ -15,13 +15,13 @@ // DISASM-NEXT: 6: call{{.}} 0x5 /// There is currently no error for -r. See also https://github.com/ClangBuiltLinux/linux/issues/1937 -// RUN: ld.lld -T lds -r a.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=REL-R --implicit-check-not=warning: +// RUN: ld.lld -T lds -r a.o 2>&1 | FileCheck %s --check-prefix=REL-R --implicit-check-not=warning: // REL-R: warning: {{.*}}:(.nonalloc1+0xa): has non-ABS relocation R_386_PC32 against symbol '' // RUN: llvm-mc -filetype=obj -triple=x86_64 asm -o b.o // RUN: ld.lld -T lds b.o -o b 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=warning: // RUN: llvm-objdump -D --no-show-raw-insn b | FileCheck --check-prefix=DISASM %s -// RUN: ld.lld -T lds -r b.o -o /dev/null --fatal-warnings +// RUN: ld.lld -T lds -r b.o --fatal-warnings // CHECK2: warning: {{.*}}.o:(.nonalloc1+0x1): has non-ABS relocation R_X86_64_PC32 against symbol '_start' // CHECK2-NEXT: warning: {{.*}}.o:(.nonalloc1+0x6): has non-ABS relocation R_X86_64_PC32 against symbol 'ifunc' // CHECK2-NEXT: warning: {{.*}}.o:(.nonalloc1+0xa): has non-ABS relocation R_X86_64_PC32 against symbol '' diff --git a/lld/test/ELF/print-archive-stats.s b/lld/test/ELF/print-archive-stats.s index 2dd236f8e0a1f..5116685623ce2 100644 --- a/lld/test/ELF/print-archive-stats.s +++ b/lld/test/ELF/print-archive-stats.s @@ -10,7 +10,7 @@ # RUN: llvm-ar rc 1.a 1.o 2.o 3.o # RUN: llvm-ar rc lib2.a -# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=a.txt -o /dev/null +# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=a.txt # RUN: FileCheck --input-file=a.txt -DT=%t %s --match-full-lines --strict-whitespace ## Fetches 0 member from %t/weak.a and 2 members from %t1.a @@ -20,10 +20,10 @@ # CHECK-NEXT:0 0 {{.*}}lib2.a ## - means stdout. -# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=- -o /dev/null | diff a.txt - +# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=- | diff a.txt - ## The second 1.a has 0 fetched member. -# RUN: ld.lld a.o %t/weak.a -L. -l:1.a -l:1.a --print-archive-stats=- -o /dev/null | \ +# RUN: ld.lld a.o %t/weak.a -L. 
-l:1.a -l:1.a --print-archive-stats=- | \ # RUN: FileCheck --check-prefix=CHECK2 %s # CHECK2: members extracted archive # CHECK2-NEXT: 1 0 {{.*}}weak.a @@ -31,7 +31,7 @@ # CHECK2-NEXT: 3 0 {{.*}}1.a # CHECK2-NEXT: 0 0 {{.*}}lib2.a -# RUN: not ld.lld -shared a.o -L. --print-archive-stats=/ -o /dev/null 2>&1 | FileCheck --check-prefix=ERR %s +# RUN: not ld.lld -shared a.o -L. --print-archive-stats=/ 2>&1 | FileCheck --check-prefix=ERR %s # ERR: error: --print-archive-stats=: cannot open /: {{.*}} #--- a.s diff --git a/lld/test/ELF/remap-inputs.test b/lld/test/ELF/remap-inputs.test index 0f9cafa987ac9..1be01c792a37b 100644 --- a/lld/test/ELF/remap-inputs.test +++ b/lld/test/ELF/remap-inputs.test @@ -17,26 +17,26 @@ # REPRO-NEXT: d.so ## --remap-inputs can also be specified multiple times. -# RUN: ld.lld --remap-inputs 'aa.o=a.o' --remap-inputs='d[d].so=d.so' aa.o b.o c.a d.so -o /dev/null +# RUN: ld.lld --remap-inputs 'aa.o=a.o' --remap-inputs='d[d].so=d.so' aa.o b.o c.a d.so ## A multiple-to-one pattern may easily cause issues. Users should be careful. -# RUN: not ld.lld --remap-inputs-file=3.map aa.o bb.bc -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=3.map aa.o bb.bc 2>&1 | \ # RUN: FileCheck %s --check-prefix=DUPLICATE --implicit-check-not=error: # DUPLICATE: error: duplicate symbol: _start -# RUN: not ld.lld --remap-inputs-file=err1.map aa.o bb.bc -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=err1.map aa.o bb.bc 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR1 --implicit-check-not=error: # ERR1: error: err1.map:2: parse error, not 'from-glob=to-file' # ERR1-NEXT: error: cannot open bb.bc: {{.*}} -# RUN: not ld.lld --remap-inputs-file=err2.map aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=err2.map aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR2 --implicit-check-not=error: # ERR2: error: err2.map:1: invalid glob pattern, unmatched '[': aa.[o # ERR2-NEXT: error: cannot open aa.o: {{.*}} -# RUN: not ld.lld --remap-inputs=aa.o aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs=aa.o aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR3 --implicit-check-not=error: -# RUN: not ld.lld --remap-inputs=aa.o= aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs=aa.o= aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR3 --implicit-check-not=error: # ERR3: error: --remap-inputs: parse error, not 'from-glob=to-file' # ERR3-NEXT: error: cannot open aa.o: {{.*}} diff --git a/lld/test/ELF/reproduce-deplibs.s b/lld/test/ELF/reproduce-deplibs.s index 06c25a2239834..48486d0e2bde7 100644 --- a/lld/test/ELF/reproduce-deplibs.s +++ b/lld/test/ELF/reproduce-deplibs.s @@ -8,7 +8,7 @@ # RUN: llvm-ar rc foo.a foo.o # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o bar.o -# RUN: ld.lld bar.o -o /dev/null --reproduce repro.tar +# RUN: ld.lld bar.o --reproduce repro.tar # RUN: tar tf repro.tar | FileCheck -DPATH='%:t.dir' %s # CHECK: [[PATH]]/foo.a diff --git a/lld/test/ELF/reproduce-lto.s b/lld/test/ELF/reproduce-lto.s index 36838f21388ef..b1a5bab122c56 100644 --- a/lld/test/ELF/reproduce-lto.s +++ b/lld/test/ELF/reproduce-lto.s @@ -5,10 +5,10 @@ # RUN: rm -rf %t.dir # RUN: mkdir -p %t.dir/build1 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.dir/build1/foo.o -# RUN: echo > %t.dir/build1/empty_profile.txt # RUN: cd %t.dir -# RUN: ld.lld build1/foo.o -o /dev/null --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o build1/foo.o +# RUN: echo > 
build1/empty_profile.txt +# RUN: ld.lld build1/foo.o --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt # RUN: tar tvf repro1.tar | FileCheck %s --implicit-check-not={{.}} # CHECK-DAG: {{.*}} repro1/{{.*}}/empty_profile.txt diff --git a/lld/test/ELF/riscv-attributes.s b/lld/test/ELF/riscv-attributes.s index d003a298101cb..13b2c7a24d0b8 100644 --- a/lld/test/ELF/riscv-attributes.s +++ b/lld/test/ELF/riscv-attributes.s @@ -31,7 +31,7 @@ # RUN: llvm-readobj --arch-specific out3 | FileCheck %s --check-prefix=CHECK3 # RUN: llvm-mc -filetype=obj -triple=riscv64 invalid_arch1.s -o invalid_arch1.o -# RUN: not ld.lld invalid_arch1.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID_ARCH1 --implicit-check-not=error: +# RUN: not ld.lld invalid_arch1.o 2>&1 | FileCheck %s --check-prefix=INVALID_ARCH1 --implicit-check-not=error: # INVALID_ARCH1: error: invalid_arch1.o:(.riscv.attributes): rv64i2: extension lacks version in expected format ## A zero value attribute is not printed. @@ -41,20 +41,20 @@ ## Differing stack_align values lead to an error. # RUN: llvm-mc -filetype=obj -triple=riscv64 diff_stack_align.s -o diff_stack_align.o -# RUN: not ld.lld a.o b.o c.o diff_stack_align.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=STACK_ALIGN --implicit-check-not=error: +# RUN: not ld.lld a.o b.o c.o diff_stack_align.o 2>&1 | FileCheck %s --check-prefix=STACK_ALIGN --implicit-check-not=error: # STACK_ALIGN: error: diff_stack_align.o:(.riscv.attributes) has stack_align=32 but a.o:(.riscv.attributes) has stack_align=16 ## RISC-V tag merging for atomic_abi values A6C and A7 lead to an error. # RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_A6C.s -o atomic_abi_A6C.o # RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_A7.s -o atomic_abi_A7.o -# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_A7.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_ERROR --implicit-check-not=error: +# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_A7.o 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_ERROR --implicit-check-not=error: # ATOMIC_ABI_ERROR: error: atomic abi mismatch for .riscv.attributes # ATOMIC_ABI_ERROR-NEXT: >>> atomic_abi_A6C.o:(.riscv.attributes): atomic_abi=1 # ATOMIC_ABI_ERROR-NEXT: >>> atomic_abi_A7.o:(.riscv.attributes): atomic_abi=3 ## RISC-V tag merging for atomic_abi values A6C and invalid lead to an error. 
# RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_invalid.s -o atomic_abi_invalid.o -# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_invalid.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_INVALID --implicit-check-not=error: +# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_invalid.o 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_INVALID --implicit-check-not=error: # ATOMIC_ABI_INVALID: error: unknown atomic abi for .riscv.attributes # ATOMIC_ABI_INVALID-NEXT: >>> atomic_abi_invalid.o:(.riscv.attributes): atomic_abi=42 diff --git a/lld/test/ELF/unknown-section.test b/lld/test/ELF/unknown-section.test index f6ecca29a22ae..faf420e1fb5c4 100644 --- a/lld/test/ELF/unknown-section.test +++ b/lld/test/ELF/unknown-section.test @@ -1,6 +1,6 @@ # RUN: rm -rf %t && mkdir %t && cd %t # RUN: yaml2obj %s -o a.o -# RUN: not ld.lld a.o -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: +# RUN: not ld.lld a.o 2>&1 | FileCheck %s --implicit-check-not=error: # CHECK: error: a.o:(relr): unknown section type 0x13 # CHECK-NEXT: error: a.o:(regular): unknown section type 0x15 diff --git a/lld/test/ELF/why-extract.s b/lld/test/ELF/why-extract.s index a41db8d9fd49a..3235bce5a7167 100644 --- a/lld/test/ELF/why-extract.s +++ b/lld/test/ELF/why-extract.s @@ -12,18 +12,18 @@ # RUN: cd %t ## Nothing is extracted from an archive. The file is created with just a header. -# RUN: ld.lld main.o a.o b.a -o /dev/null --why-extract=why1.txt +# RUN: ld.lld main.o a.o b.a --why-extract=why1.txt # RUN: FileCheck %s --input-file=why1.txt --check-prefix=CHECK1 --match-full-lines --strict-whitespace # CHECK1:reference extracted symbol # CHECK1-NOT:{{.}} ## Some archive members are extracted. -# RUN: ld.lld main.o a_b.a b.a -o /dev/null --why-extract=why2.txt +# RUN: ld.lld main.o a_b.a b.a --why-extract=why2.txt # RUN: FileCheck %s --input-file=why2.txt --check-prefix=CHECK2 --match-full-lines --strict-whitespace ## A relocation error does not suppress the output. -# RUN: rm -f why2.txt && not ld.lld main.o a_b.a b.a err.o -o /dev/null --why-extract=why2.txt +# RUN: rm -f why2.txt && not ld.lld main.o a_b.a b.a err.o --why-extract=why2.txt # RUN: FileCheck %s --input-file=why2.txt --check-prefix=CHECK2 --match-full-lines --strict-whitespace # CHECK2:reference extracted symbol @@ -31,12 +31,12 @@ # CHECK2-NEXT:a_b.a(a_b.o) b.a(b.o) b() ## An undefined symbol error does not suppress the output. -# RUN: not ld.lld main.o a_b.a -o /dev/null --why-extract=why3.txt +# RUN: not ld.lld main.o a_b.a --why-extract=why3.txt # RUN: FileCheck %s --input-file=why3.txt --check-prefix=CHECK3 --match-full-lines --strict-whitespace ## Check that backward references are supported. ## - means stdout. 
-# RUN: ld.lld b.a a_b.a main.o -o /dev/null --why-extract=- | FileCheck %s --check-prefix=CHECK4 +# RUN: ld.lld b.a a_b.a main.o --why-extract=- | FileCheck %s --check-prefix=CHECK4 # CHECK3:reference extracted symbol # CHECK3-NEXT:main.o a_b.a(a_b.o) a @@ -45,34 +45,34 @@ # CHECK4-NEXT:a_b.a(a_b.o) b.a(b.o) b() # CHECK4-NEXT:main.o a_b.a(a_b.o) a -# RUN: ld.lld main.o a_b.a b.a -o /dev/null --no-demangle --why-extract=- | FileCheck %s --check-prefix=MANGLED +# RUN: ld.lld main.o a_b.a b.a --no-demangle --why-extract=- | FileCheck %s --check-prefix=MANGLED # MANGLED: a_b.a(a_b.o) b.a(b.o) _Z1bv -# RUN: ld.lld main.o a.a b.a -o /dev/null -u _Z1bv --why-extract=- | FileCheck %s --check-prefix=UNDEFINED +# RUN: ld.lld main.o a.a b.a -u _Z1bv --why-extract=- | FileCheck %s --check-prefix=UNDEFINED ## We insert -u symbol before processing other files, so its name is . ## This is not ideal. # UNDEFINED: b.a(b.o) b() -# RUN: ld.lld main.o a.a b.a -o /dev/null --undefined-glob '_Z1b*' --why-extract=- | FileCheck %s --check-prefix=UNDEFINED_GLOB +# RUN: ld.lld main.o a.a b.a --undefined-glob '_Z1b*' --why-extract=- | FileCheck %s --check-prefix=UNDEFINED_GLOB # UNDEFINED_GLOB: --undefined-glob b.a(b.o) b() -# RUN: ld.lld main.o a.a b.a -o /dev/null -e _Z1bv --why-extract=- | FileCheck %s --check-prefix=ENTRY +# RUN: ld.lld main.o a.a b.a -e _Z1bv --why-extract=- | FileCheck %s --check-prefix=ENTRY # ENTRY: --entry b.a(b.o) b() -# RUN: ld.lld main.o b.a -o /dev/null -T a.lds --why-extract=- | FileCheck %s --check-prefix=SCRIPT +# RUN: ld.lld main.o b.a -T a.lds --why-extract=- | FileCheck %s --check-prefix=SCRIPT # SCRIPT: b.a(b.o) b() -# RUN: ld.lld main.o --start-lib a_b.o b.o --end-lib -o /dev/null --why-extract=- | FileCheck %s --check-prefix=LAZY +# RUN: ld.lld main.o --start-lib a_b.o b.o --end-lib --why-extract=- | FileCheck %s --check-prefix=LAZY # LAZY: main.o a_b.o a # LAZY: a_b.o b.o b() -# RUN: not ld.lld -shared main.o -o /dev/null --why-extract=/ 2>&1 | FileCheck %s --check-prefix=ERR +# RUN: not ld.lld -shared main.o --why-extract=/ 2>&1 | FileCheck %s --check-prefix=ERR # ERR: error: cannot open --why-extract= file /: {{.*}} From 8ecd78832437d45c269a361d9360abf0de92984f Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 19 Feb 2025 17:26:25 +0000 Subject: [PATCH 088/220] Remove header file spuriously added by 9905728e2fb4ebe9b7518dfd73a0574eea0a2083. --- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 62e1c4c3ed3b1..d725a457aeff6 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -32,7 +32,6 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" -#include "llvm/Support/Casting.h" #include #include #include From aa847ced0721bcfc411b8827e54f1681edb9cc8c Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Wed, 19 Feb 2025 18:29:47 +0100 Subject: [PATCH 089/220] [InstCombine] handle trunc to i1 in foldSelectICmpAndBinOp (#127390) for `trunc nuw` saves a instruction and otherwise only other instructions without the select, same behavior as for bit test before. 
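For illustration, here is a distilled before/after based on the updated
select_trunc_nuw_or_2 test in this patch (function and value names are
only for this sketch):

  ; before: select whose condition is a trunc-to-i1
  define i8 @src(i8 %x, i8 %y) {
    %trunc = trunc nuw i8 %x to i1
    %or = or i8 %y, 2
    %sel = select i1 %trunc, i8 %or, i8 %y
    ret i8 %sel
  }

  ; after: nuw guarantees %x is 0 or 1, so no masking 'and' is needed
  define i8 @tgt(i8 %x, i8 %y) {
    %shifted = shl i8 %x, 1
    %sel = or i8 %y, %shifted
    ret i8 %sel
  }

Without nuw the fold still applies, but an extra 'and i8 %shifted, 2'
masks the shifted value first (see select_trunc_or_2 in the test diff).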
proof: https://alive2.llvm.org/ce/z/a6QmyV --- .../InstCombine/InstCombineSelect.cpp | 62 +++++++++++-------- .../InstCombine/select-with-bitwise-ops.ll | 29 +++++---- 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 1b80e3555fbea..e621a0b7fe596 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -746,39 +746,47 @@ static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal, /// 1. The icmp predicate is inverted /// 2. The select operands are reversed /// 3. The magnitude of C2 and C1 are flipped -static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal, - Value *FalseVal, - InstCombiner::BuilderTy &Builder) { +static Value *foldSelectICmpAndBinOp(Value *CondVal, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { // Only handle integer compares. Also, if this is a vector select, we need a // vector compare. if (!TrueVal->getType()->isIntOrIntVectorTy() || - TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy()) + TrueVal->getType()->isVectorTy() != CondVal->getType()->isVectorTy()) return nullptr; - Value *CmpLHS = IC->getOperand(0); - Value *CmpRHS = IC->getOperand(1); - unsigned C1Log; bool NeedAnd = false; - CmpInst::Predicate Pred = IC->getPredicate(); - if (IC->isEquality()) { - if (!match(CmpRHS, m_Zero())) - return nullptr; + CmpPredicate Pred; + Value *CmpLHS, *CmpRHS; - const APInt *C1; - if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) - return nullptr; + if (match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)))) { + if (ICmpInst::isEquality(Pred)) { + if (!match(CmpRHS, m_Zero())) + return nullptr; - C1Log = C1->logBase2(); - } else { - auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred); - if (!Res || !Res->Mask.isPowerOf2()) - return nullptr; + const APInt *C1; + if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) + return nullptr; - CmpLHS = Res->X; - Pred = Res->Pred; - C1Log = Res->Mask.logBase2(); - NeedAnd = true; + C1Log = C1->logBase2(); + } else { + auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred); + if (!Res || !Res->Mask.isPowerOf2()) + return nullptr; + + CmpLHS = Res->X; + Pred = Res->Pred; + C1Log = Res->Mask.logBase2(); + NeedAnd = true; + } + } else if (auto *Trunc = dyn_cast(CondVal)) { + CmpLHS = Trunc->getOperand(0); + C1Log = 0; + Pred = ICmpInst::ICMP_NE; + NeedAnd = !Trunc->hasNoUnsignedWrap(); + } else { + return nullptr; } Value *Y, *V = CmpLHS; @@ -812,7 +820,7 @@ static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal, // Make sure we don't create more instructions than we save. 
if ((NeedShift + NeedXor + NeedZExtTrunc + NeedAnd) > - (IC->hasOneUse() + BinOp->hasOneUse())) + (CondVal->hasOneUse() + BinOp->hasOneUse())) return nullptr; if (NeedAnd) { @@ -1990,9 +1998,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *V = foldSelectZeroOrOnes(ICI, TrueVal, FalseVal, Builder)) return V; - if (Value *V = foldSelectICmpAndBinOp(ICI, TrueVal, FalseVal, Builder)) - return replaceInstUsesWith(SI, V); - if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder)) return replaceInstUsesWith(SI, V); @@ -3950,6 +3955,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (Instruction *Result = foldSelectInstWithICmp(SI, ICI)) return Result; + if (Value *V = foldSelectICmpAndBinOp(CondVal, TrueVal, FalseVal, Builder)) + return replaceInstUsesWith(SI, V); + if (Instruction *Add = foldAddSubSelect(SI, Builder)) return Add; if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder)) diff --git a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll index 67dec9178eeca..ca2e23c1d082e 100644 --- a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll +++ b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll @@ -1754,9 +1754,9 @@ define i8 @select_icmp_eq_and_1_0_lshr_tv(i8 %x, i8 %y) { define i8 @select_trunc_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc i8 %x to i1 @@ -1767,9 +1767,9 @@ define i8 @select_trunc_or_2(i8 %x, i8 %y) { define i8 @select_not_trunc_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_not_trunc_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc i8 %x to i1 @@ -1781,9 +1781,8 @@ define i8 @select_not_trunc_or_2(i8 %x, i8 %y) { define i8 @select_trunc_nuw_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_nuw_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc nuw i8 %x to i1 @@ -1794,9 +1793,9 @@ define i8 @select_trunc_nuw_or_2(i8 %x, i8 %y) { define i8 @select_trunc_nsw_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_nsw_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc nsw i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc nsw i8 %x to i1 @@ -1807,9 +1806,9 @@ define i8 @select_trunc_nsw_or_2(i8 %x, i8 %y) { define <2 x i8> 
@select_trunc_or_2_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @select_trunc_or_2_vec( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc <2 x i8> [[X:%.*]] to <2 x i1> -; CHECK-NEXT: [[OR:%.*]] = or <2 x i8> [[Y:%.*]], splat (i8 2) -; CHECK-NEXT: [[SELECT:%.*]] = select <2 x i1> [[TRUNC]], <2 x i8> [[OR]], <2 x i8> [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i8> [[X:%.*]], splat (i8 1) +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], splat (i8 2) +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i8> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[SELECT]] ; %trunc = trunc <2 x i8> %x to <2 x i1> From e1d1bb93d208c5772c385549c7efaa3d83459d2e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 19 Feb 2025 09:38:48 -0800 Subject: [PATCH 090/220] [ELF,test] Clean up aarch64-relocs.s --- lld/test/ELF/aarch64-relocs.s | 95 +++++++++++++---------------------- 1 file changed, 35 insertions(+), 60 deletions(-) diff --git a/lld/test/ELF/aarch64-relocs.s b/lld/test/ELF/aarch64-relocs.s index 198674c085b54..39cfcdd38661d 100644 --- a/lld/test/ELF/aarch64-relocs.s +++ b/lld/test/ELF/aarch64-relocs.s @@ -25,12 +25,10 @@ mystr: .asciz "blah" .size mystr, 4 -# PAGE(S + A) - PAGE(P) = PAGE(210136) - PAGE(0x210132) = 0 -# # CHECK: Disassembly of section .R_AARCH64_ADR_PREL_PG_HI21: # CHECK-EMPTY: # CHECK-NEXT: <.R_AARCH64_ADR_PREL_PG_HI21>: -# CHECK-NEXT: 210132: 90000001 adrp x1, 0x210000 +# CHECK-NEXT: adrp x1, 0x210000 .section .R_AARCH64_ADD_ABS_LO12_NC,"ax",@progbits add x0, x0, :lo12:.L.str @@ -64,39 +62,16 @@ foo: nop sub: nop - -# CHECK: Disassembly of section .SUB: -# CHECK-EMPTY: -# CHECK-NEXT: <.SUB>: -# CHECK-NEXT: 21014c: d503201f nop -# CHECK: : -# CHECK-NEXT: 210150: d503201f nop - .section .R_AARCH64_CALL26,"ax",@progbits call26: bl sub + b sub -# S = 0x21014c, A = 0x4, P = 0x210154 -# R = S + A - P = -0x4 = 0xfffffffc -# (R & 0x0ffffffc) >> 2 = 0x03ffffff -# 0x94000000 | 0x03ffffff = 0x97ffffff # CHECK: Disassembly of section .R_AARCH64_CALL26: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: 210154: 97ffffff bl 0x210150 - -.section .R_AARCH64_JUMP26,"ax",@progbits -jump26: - b sub - -# S = 0x21014c, A = 0x4, P = 0x210158 -# R = S + A - P = -0x8 = 0xfffffff8 -# (R & 0x0ffffffc) >> 2 = 0x03fffffe -# 0x14000000 | 0x03fffffe = 0x17fffffe -# CHECK: Disassembly of section .R_AARCH64_JUMP26: -# CHECK-EMPTY: -# CHECK-NEXT: : -# CHECK-NEXT: 210158: 17fffffe b 0x210150 +# CHECK-NEXT: bl {{.*}} +# CHECK-NEXT: b {{.*}} .section .R_AARCH64_LDST32_ABS_LO12_NC,"ax",@progbits ldst32: @@ -179,14 +154,14 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_UABS: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: f280018c movk x12, #12 -# CHECK-NEXT: f280018c movk x12, #12 -# CHECK-NEXT: f2a001ad movk x13, #13, lsl #16 -# CHECK-NEXT: f2a001ad movk x13, #13, lsl #16 -# CHECK-NEXT: f2c001ce movk x14, #14, lsl #32 -# CHECK-NEXT: f2c001ce movk x14, #14, lsl #32 -# CHECK-NEXT: d2e001ef mov x15, #4222124650659840 -# CHECK-NEXT: f2e001f0 movk x16, #15, lsl #48 +# CHECK-NEXT: movk x12, #12 +# CHECK-NEXT: movk x12, #12 +# CHECK-NEXT: movk x13, #13, lsl #16 +# CHECK-NEXT: movk x13, #13, lsl #16 +# CHECK-NEXT: movk x14, #14, lsl #32 +# CHECK-NEXT: movk x14, #14, lsl #32 +# CHECK-NEXT: mov x15, #4222124650659840 +# CHECK-NEXT: movk x16, #15, lsl #48 .section .R_AARCH64_MOVW_SABS,"ax",@progbits movz x1, #:abs_g0_s:zero+1 @@ -199,15 +174,15 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_SABS: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: d2800021 mov x1, #1 -# CHECK-NEXT: 92800001 mov x1, #-1 -# CHECK-NEXT: d2a00042 mov x2, 
#131072 +# CHECK-NEXT: mov x1, #1 +# CHECK-NEXT: mov x1, #-1 +# CHECK-NEXT: mov x2, #131072 ## -65537 = 0xfffffffffffeffff -# CHECK-NEXT: 92a00022 mov x2, #-65537 +# CHECK-NEXT: mov x2, #-65537 ## 12884901888 = 0x300000000 -# CHECK-NEXT: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #12884901888 ## -8589934593 = #0xfffffffdffffffff -# CHECK-NEXT: 92c00043 mov x3, #-8589934593 +# CHECK-NEXT: mov x3, #-8589934593 .section .R_AARCH64_MOVW_PREL,"ax",@progbits movz x1, #:prel_g0:.+1 @@ -231,24 +206,24 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_PREL: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: 2101bc: d2800021 mov x1, #1 -# CHECK-NEXT: 2101c0: 92800001 mov x1, #-1 -# CHECK-NEXT: 2101c4: f2800021 movk x1, #1 -# CHECK-NEXT: 2101c8: f29fffe1 movk x1, #65535 -# CHECK-NEXT: 2101cc: d2a00042 mov x2, #131072 +# CHECK-NEXT: mov x1, #1 +# CHECK-NEXT: mov x1, #-1 +# CHECK-NEXT: movk x1, #1 +# CHECK-NEXT: movk x1, #65535 +# CHECK-NEXT: mov x2, #131072 ## -65537 = 0xfffffffffffeffff -# CHECK-NEXT: 2101d0: 92a00022 mov x2, #-65537 -# CHECK-NEXT: 2101d4: f2a00042 movk x2, #2, lsl #16 -# CHECK-NEXT: 2101d8: f2bfffc2 movk x2, #65534, lsl #16 +# CHECK-NEXT: mov x2, #-65537 +# CHECK-NEXT: movk x2, #2, lsl #16 +# CHECK-NEXT: movk x2, #65534, lsl #16 ## 12884901888 = 0x300000000 -# CHECK-NEXT: 2101dc: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #12884901888 ## -8589934593 = #0xfffffffdffffffff -# CHECK-NEXT: 2101e0: 92c00043 mov x3, #-8589934593 -# CHECK-NEXT: 2101e4: f2c00063 movk x3, #3, lsl #32 -# CHECK-NEXT: 2101e8: f2dfffa3 movk x3, #65533, lsl #32 -# CHECK-NEXT: 2101ec: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #-8589934593 +# CHECK-NEXT: movk x3, #3, lsl #32 +# CHECK-NEXT: movk x3, #65533, lsl #32 +# CHECK-NEXT: mov x3, #12884901888 ## 1125899906842624 = 0x4000000000000 -# CHECK-NEXT: 2101f0: d2e00084 mov x4, #1125899906842624 -# CHECK-NEXT: 2101f4: d2ffff84 mov x4, #-1125899906842624 -# CHECK-NEXT: 2101f8: f2e00084 movk x4, #4, lsl #48 -# CHECK-NEXT: 2101fc: f2ffff84 movk x4, #65532, lsl #48 +# CHECK-NEXT: mov x4, #1125899906842624 +# CHECK-NEXT: mov x4, #-1125899906842624 +# CHECK-NEXT: movk x4, #4, lsl #48 +# CHECK-NEXT: movk x4, #65532, lsl #48 From 3e8db13ced157995d681ee067a121061afcdf808 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 19 Feb 2025 09:24:14 -0800 Subject: [PATCH 091/220] [SLP][NFC]Replace undefs by zeroinitializer --- .../SLPVectorizer/X86/crash_7zip.ll | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll index c029781142af3..ae851e3319e1f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -13,17 +13,18 @@ define fastcc void @LzmaDec_DecodeReal2(ptr %p, i1 %arg) { ; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], ptr [[P:%.*]], i64 0, i32 4 ; CHECK-NEXT: br label [[DO_BODY66_I:%.*]] ; CHECK: do.body66.i: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP0]] -; CHECK-NEXT: br i1 %arg, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[DO_COND_I:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> zeroinitializer, 
<2 x i32> zeroinitializer, <2 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] ; CHECK: if.else.i: -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP1]], undef +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: br label [[DO_COND_I]] ; CHECK: do.cond.i: -; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ [[TMP2]], [[IF_ELSE_I]] ], [ [[TMP1]], [[DO_BODY66_I]] ] -; CHECK-NEXT: br i1 %arg, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] +; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ [[TMP2]], [[IF_ELSE_I]] ], [ [[TMP5]], [[DO_BODY66_I]] ] +; CHECK-NEXT: br i1 [[ARG]], label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] ; CHECK: do.end1006.i: -; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP3]] ; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[RANGE20_I]], align 4 ; CHECK-NEXT: ret void ; @@ -33,25 +34,25 @@ entry: br label %do.body66.i do.body66.i: ; preds = %do.cond.i, %entry - %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ] - %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ] - %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i - %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i + %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ zeroinitializer, %entry ] + %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ zeroinitializer, %entry ] + %.range.2.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %range.2.i + %.code.2.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %code.2.i br i1 %arg, label %do.cond.i, label %if.else.i if.else.i: ; preds = %do.body66.i - %sub91.i = sub i32 %.range.2.i, undef - %sub92.i = sub i32 %.code.2.i, undef + %sub91.i = sub i32 %.range.2.i, zeroinitializer + %sub92.i = sub i32 %.code.2.i, zeroinitializer br label %do.cond.i do.cond.i: ; preds = %if.else.i, %do.body66.i - %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ] + %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ zeroinitializer, %do.body66.i ] %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ] br i1 %arg, label %do.body66.i, label %do.end1006.i do.end1006.i: ; preds = %do.cond.i - %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i - %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i + %.range.4.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %range.4.i + %.code.4.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %code.4.i store i32 %.range.4.i, ptr %range20.i, align 4 store i32 %.code.4.i, ptr %code21.i, align 4 ret void From ad87d5f23d921bd4a8d9677f7db563c649a1f5c3 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Wed, 19 Feb 2025 12:44:44 -0500 Subject: [PATCH 092/220] [libc++][test] Refactor tests for std::{copy, move, fill} algorithms (#120909) This refactor includes the following changes: - Refactor similar tests using `types::for_each` to remove redundant code; - Explicitly include the missing header `type_algorithms.h` in some test files; - Some tests scattered in different test functions with ad-hoc names (e.g., `test5()`, `test6()`) but belong to the same kind are now grouped into one function (`test_struct_array()`). 
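As a rough sketch of the first point, the pattern these tests converge on
(simplified from the updated copy_if/copy_n tests; `types::for_each` and the
iterator type lists come from the test-support header `type_algorithms.h`)
looks like:

  #include <algorithm>
  #include <cassert>

  #include "test_iterators.h"
  #include "test_macros.h"
  #include "type_algorithms.h"

  template <class InIter>
  struct TestOutIters {
    template <class OutIter>
    TEST_CONSTEXPR_CXX20 void operator()() {
      // One body, instantiated for every (InIter, OutIter) combination.
      int in[4]  = {1, 2, 3, 4};
      int out[4] = {};
      OutIter r = std::copy(InIter(in), InIter(in + 4), OutIter(out));
      assert(base(r) == out + 4);
    }
  };

  struct TestInIters {
    template <class InIter>
    TEST_CONSTEXPR_CXX20 void operator()() {
      // Inner loop over the output iterator types.
      types::for_each(types::forward_iterator_list<int*>(), TestOutIters<InIter>());
    }
  };

  TEST_CONSTEXPR_CXX20 bool test() {
    // Replaces the long list of explicit test_copy<In, Out>() instantiations.
    types::for_each(types::cpp17_input_iterator_list<const int*>(), TestInIters());
    return true;
  }

  int main(int, char**) {
    test();
    return 0;
  }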
--- .../alg.copy/copy.pass.cpp | 4 +- .../alg.copy/copy_backward.pass.cpp | 62 +++--- .../alg.copy/copy_if.pass.cpp | 83 +++----- .../alg.copy/copy_n.pass.cpp | 104 ++++------ .../alg.copy/pstl.copy.pass.cpp | 1 + .../alg.copy/pstl.copy_n.pass.cpp | 3 +- .../alg.copy/ranges.copy_n.pass.cpp | 39 +--- .../alg.fill/fill.pass.cpp | 1 + .../alg.fill/fill_n.pass.cpp | 186 +++++++++--------- .../alg.fill/pstl.fill.pass.cpp | 1 + .../alg.fill/pstl.fill_n.pass.cpp | 1 + .../alg.move/move.pass.cpp | 7 +- .../alg.move/move_backward.pass.cpp | 7 +- .../alg.move/pstl.move.pass.cpp | 1 + 14 files changed, 204 insertions(+), 296 deletions(-) diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp index 1ca397c92a334..3d4ee23a5a7ff 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp @@ -18,6 +18,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -81,7 +82,7 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } TEST_CONSTEXPR_CXX20 bool test() { - types::for_each(types::cpp17_input_iterator_list(), TestInIters()); + types::for_each(types::cpp17_input_iterator_list(), TestInIters()); { // Make sure that padding bits aren't copied Derived src(1, 2, 3); @@ -91,7 +92,6 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::copy(a + 3, a + 10, a); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp index 445c7718e1111..8a528a96f5294 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp @@ -19,6 +19,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include "user_defined_integral.h" class PaddedBase { @@ -36,21 +37,29 @@ class Derived : public PaddedBase { std::int8_t c_; }; -template -TEST_CONSTEXPR_CXX20 void test_copy_backward() { - { - const unsigned N = 1000; - int ia[N] = {}; - for (unsigned i = 0; i < N; ++i) - ia[i] = i; - int ib[N] = {0}; - - OutIter r = std::copy_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); - assert(base(r) == ib); - for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); +struct TestIterators { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each(types::bidirectional_iterator_list(), TestImpl()); } -} + + template + struct TestImpl { + template + TEST_CONSTEXPR_CXX20 void operator()() { + const unsigned N = 1000; + int ia[N] = {}; + for (unsigned i = 0; i < N; ++i) + ia[i] = i; + int ib[N] = {0}; + + OutIter r = std::copy_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); + assert(base(r) == ib); + for (unsigned i = 0; i < N; ++i) + assert(ia[i] == ib[i]); + } + }; +}; TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -70,31 +79,10 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } return true; -}; +} TEST_CONSTEXPR_CXX20 bool test() { - test_copy_backward, bidirectional_iterator >(); - test_copy_backward, random_access_iterator >(); - test_copy_backward, int*>(); - - 
test_copy_backward, bidirectional_iterator >(); - test_copy_backward, random_access_iterator >(); - test_copy_backward, int*>(); - - test_copy_backward >(); - test_copy_backward >(); - test_copy_backward(); - -#if TEST_STD_VER > 17 - test_copy_backward, bidirectional_iterator>(); - test_copy_backward, random_access_iterator>(); - test_copy_backward, int*>(); - - test_copy_backward, contiguous_iterator>(); - test_copy_backward, contiguous_iterator>(); - test_copy_backward, contiguous_iterator>(); - test_copy_backward>(); -#endif + types::for_each(types::bidirectional_iterator_list(), TestIterators()); { // Make sure that padding bits aren't copied Derived src(1, 2, 3); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp index 57214e65455b4..3bee77738e342 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp @@ -19,75 +19,48 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" -struct Pred -{ - TEST_CONSTEXPR_CXX14 bool operator()(int i) {return i % 3 == 0;} +struct Pred { + TEST_CONSTEXPR_CXX14 bool operator()(int i) { return i % 3 == 0; } }; -template -TEST_CONSTEXPR_CXX20 void -test_copy_if() -{ +template +struct TestOutIters { + template + TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::copy_if(InIter(ia), InIter(ia+N), OutIter(ib), Pred()); - assert(base(r) == ib+N/3+1); - for (unsigned i = 0; i < N/3+1; ++i) - assert(ib[i] % 3 == 0); -} - -TEST_CONSTEXPR_CXX20 bool -test() -{ - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); + OutIter r = std::copy_if(InIter(ia), InIter(ia + N), OutIter(ib), Pred()); + assert(base(r) == ib + N / 3 + 1); + for (unsigned i = 0; i < N / 3 + 1; ++i) + assert(ib[i] % 3 == 0); + } +}; - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if(); +struct TestInIters { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each( + types::concatenate_t, types::type_list > >(), + TestOutIters()); + } +}; +TEST_CONSTEXPR_CXX20 bool test() { + types::for_each(types::cpp17_input_iterator_list(), TestInIters()); return true; } -int main(int, char**) -{ - test(); +int main(int, char**) { + test(); #if TEST_STD_VER > 17 - static_assert(test()); + static_assert(test()); #endif return 0; diff --git 
a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp index 889e71f4eceb9..2053134a01a2f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp @@ -18,6 +18,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include "user_defined_integral.h" typedef UserDefinedIntegral UDI; @@ -37,37 +38,31 @@ class Derived : public PaddedBase { std::int8_t c_; }; -template -TEST_CONSTEXPR_CXX20 void test_copy_n() { - { - const unsigned N = 1000; - int ia[N] = {}; - for (unsigned i = 0; i < N; ++i) - ia[i] = i; - int ib[N] = {0}; - - OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib)); - assert(base(r) == ib + N / 2); - for (unsigned i = 0; i < N / 2; ++i) - assert(ia[i] == ib[i]); +struct TestIterators { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each( + types::concatenate_t, types::type_list > >(), + TestImpl()); } - { // Make sure that padding bits aren't copied - Derived src(1, 2, 3); - Derived dst(4, 5, 6); - std::copy_n(static_cast(&src), 1, static_cast(&dst)); - assert(dst.a_ == 1); - assert(dst.b_ == 2); - assert(dst.c_ == 6); - } - - { // Make sure that overlapping ranges can be copied - int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - std::copy_n(a + 3, 7, a); - int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; - assert(std::equal(a, a + 10, expected)); - } -} + template + struct TestImpl { + template + TEST_CONSTEXPR_CXX20 void operator()() { + const unsigned N = 1000; + int ia[N] = {}; + for (unsigned i = 0; i < N; ++i) + ia[i] = i; + int ib[N] = {0}; + + OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib)); + assert(base(r) == ib + N / 2); + for (unsigned i = 0; i < N / 2; ++i) + assert(ia[i] == ib[i]); + } + }; +}; TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -90,40 +85,23 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } TEST_CONSTEXPR_CXX20 bool test() { - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n(); + types::for_each(types::cpp17_input_iterator_list(), TestIterators()); + + { // Make sure that padding bits aren't copied + Derived src(1, 2, 3); + Derived dst(4, 5, 6); + std::copy_n(static_cast(&src), 1, static_cast(&dst)); + assert(dst.a_ == 1); + assert(dst.b_ == 2); + assert(dst.c_ == 6); + } + + { // Make sure that overlapping ranges can be copied + int a[] = {1, 
2, 3, 4, 5, 6, 7, 8, 9, 10}; + std::copy_n(a + 3, 7, a); + int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; + assert(std::equal(a, a + 10, expected)); + } { // Test vector::iterator optimization assert(test_vector_bool(8)); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp index bee1ef9bcec33..6229aac733a9c 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(copy); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp index 128108ac13811..7208be75c70d0 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(copy_n); @@ -58,7 +59,7 @@ struct TestIteratorsInt { }; struct CopiedToTester { - bool copied_to = false; + bool copied_to = false; CopiedToTester() = default; CopiedToTester(const CopiedToTester&) {} CopiedToTester& operator=(const CopiedToTester&) { diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp index c7031f63a02f6..577328d663d9f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp @@ -40,7 +40,7 @@ static_assert(!HasCopyNIt, std::ranges::in_out_result>); -template +template constexpr void test_iterators() { { // simple test std::array in{1, 2, 3, 4}; @@ -61,26 +61,6 @@ constexpr void test_iterators() { } } -template -constexpr void test_in_iterators() { - test_iterators, Out, sentinel_wrapper>>(); - test_iterators, Out>(); - test_iterators, Out>(); - test_iterators, Out>(); - test_iterators, Out>(); -} - -template -constexpr void test_proxy_in_iterators() { - test_iterators>, - Out, - sentinel_wrapper>>>(); - test_iterators>, Out>(); - test_iterators>, Out>(); - test_iterators>, Out>(); - test_iterators>, Out>(); -} - #if TEST_STD_VER >= 23 constexpr bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -104,17 +84,12 @@ constexpr bool test_vector_bool(std::size_t N) { #endif constexpr bool test() { - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); + types::for_each(types::cpp20_input_iterator_list{}, []() { + types::for_each(types::cpp20_input_iterator_list{}, []() { + test_iterators(); + test_iterators, ProxyIterator>(); + }); + }); { // check that every element is copied exactly once struct CopyOnce { diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp 
b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp index 7656be73c14c6..0e532ae834e7f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp @@ -22,6 +22,7 @@ #include "sized_allocator.h" #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" template TEST_CONSTEXPR_CXX20 void diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp index 3b67101a8b29e..98c412fb6cdc0 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp @@ -14,108 +14,93 @@ // fill_n(Iter first, Size n, const T& value); #include +#include #include +#include #include #include "sized_allocator.h" #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include "user_defined_integral.h" -#if TEST_STD_VER > 17 -TEST_CONSTEXPR bool test_constexpr() { - const std::size_t N = 5; - int ib[] = {0, 0, 0, 0, 0, 0}; // one bigger than N - - auto it = std::fill_n(std::begin(ib), N, 5); - return it == (std::begin(ib) + N) && std::all_of(std::begin(ib), it, [](int a) { return a == 5; }) && - *it == 0 // don't overwrite the last value in the output array - ; -} -#endif - typedef UserDefinedIntegral UDI; -template -void test_char() { - char a[4] = {}; - Iter it = std::fill_n(Iter(a), UDI(4), char(1)); - assert(base(it) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); +template +TEST_CONSTEXPR_CXX20 void +test(Container in, size_t from, size_t n, typename Container::value_type value, Container expected) { + Iter it = std::fill_n(Iter(in.data() + from), UDI(n), value); + assert(base(it) == in.data() + from + n); + assert(in == expected); } -template -void test_int() { - int a[4] = {}; - Iter it = std::fill_n(Iter(a), UDI(4), 1); - assert(base(it) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); -} +template +struct Test { + template + TEST_CONSTEXPR_CXX20 void operator()() { + { + std::array in = {1, 2, 3, 4}; + std::array expected = {5, 5, 5, 5}; + test(in, 0, 4, 5, expected); + } + { + std::array in = {1, 2, 3, 4}; + std::array expected = {1, 5, 5, 4}; + test(in, 1, 2, 5, expected); + } + } +}; -void test_int_array() { - int a[4] = {}; - assert(std::fill_n(a, UDI(4), static_cast(1)) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); +TEST_CONSTEXPR_CXX20 void test_int_array() { + { + int a[4] = {}; + assert(std::fill_n(a, UDI(4), static_cast(1)) == a + 4); + assert(a[0] == 1 && a[1] == 1 && a[2] == 1 && a[3] == 1); + } +#if TEST_STD_VER >= 11 + { + const std::size_t N = 5; + int ib[] = {0, 0, 0, 0, 0, 0}; // one bigger than N + + auto it = std::fill_n(std::begin(ib), N, 5); + assert(it == (std::begin(ib) + N) && std::all_of(std::begin(ib), it, [](int a) { return a == 5; }) && + *it == 0 // don't overwrite the last value in the output array + ); + } +#endif } struct source { - source() : i(0) {} - - operator int() const { return i++; } - mutable int i; + TEST_CONSTEXPR source() = default; + TEST_CONSTEXPR_CXX20 operator int() const { return 1; } }; -void test_int_array_struct_source() { +TEST_CONSTEXPR_CXX20 void test_int_array_struct_source() { int a[4] = {}; assert(std::fill_n(a, UDI(4), 
source()) == a + 4); - assert(a[0] == 0); + assert(a[0] == 1); assert(a[1] == 1); - assert(a[2] == 2); - assert(a[3] == 3); -} - -struct test1 { - test1() : c(0) {} - test1(char xc) : c(xc + 1) {} - char c; -}; - -void test_struct_array() { - test1 test1a[4] = {}; - assert(std::fill_n(test1a, UDI(4), static_cast(10)) == test1a + 4); - assert(test1a[0].c == 11); - assert(test1a[1].c == 11); - assert(test1a[2].c == 11); - assert(test1a[3].c == 11); + assert(a[2] == 1); + assert(a[3] == 1); } class A { char a_; public: - A() {} - explicit A(char a) : a_(a) {} - operator unsigned char() const { return 'b'; } + TEST_CONSTEXPR A() : a_('a') {}; + TEST_CONSTEXPR explicit A(char a) : a_(a) {} + TEST_CONSTEXPR operator unsigned char() const { return 'b'; } - friend bool operator==(const A& x, const A& y) { return x.a_ == y.a_; } + TEST_CONSTEXPR friend bool operator==(const A& x, const A& y) { return x.a_ == y.a_; } }; -void test5() { - A a[3]; - assert(std::fill_n(&a[0], UDI(3), A('a')) == a + 3); - assert(a[0] == A('a')); - assert(a[1] == A('a')); - assert(a[2] == A('a')); -} +struct B { + TEST_CONSTEXPR B() : c(0) {} + TEST_CONSTEXPR B(char xc) : c(xc + 1) {} + char c; +}; struct Storage { union { @@ -124,11 +109,6 @@ struct Storage { }; }; -void test6() { - Storage foo[5]; - std::fill_n(&foo[0], UDI(5), Storage()); -} - // Make sure std::fill_n behaves properly with std::vector iterators with custom size types. // See https://github.com/llvm/llvm-project/pull/122410. TEST_CONSTEXPR_CXX20 void test_bititer_with_custom_sized_types() { @@ -162,30 +142,44 @@ TEST_CONSTEXPR_CXX20 void test_bititer_with_custom_sized_types() { } } -int main(int, char**) { - test_char >(); - test_char >(); - test_char >(); - test_char >(); - test_char(); - - test_int >(); - test_int >(); - test_int >(); - test_int >(); - test_int(); +TEST_CONSTEXPR_CXX20 void test_struct_array() { + { + A a[3]; + assert(std::fill_n(&a[0], UDI(3), A('a')) == a + 3); + assert(a[0] == A('a')); + assert(a[1] == A('a')); + assert(a[2] == A('a')); + } + { + B b[4] = {}; + assert(std::fill_n(b, UDI(4), static_cast(10)) == b + 4); + assert(b[0].c == 11); + assert(b[1].c == 11); + assert(b[2].c == 11); + assert(b[3].c == 11); + } + { + Storage foo[5]; + std::fill_n(&foo[0], UDI(5), Storage()); + } +} + +TEST_CONSTEXPR_CXX20 bool test() { + types::for_each(types::forward_iterator_list(), Test()); + types::for_each(types::forward_iterator_list(), Test()); test_int_array(); - test_int_array_struct_source(); test_struct_array(); - - test5(); - test6(); - + test_int_array_struct_source(); test_bititer_with_custom_sized_types(); -#if TEST_STD_VER > 17 - static_assert(test_constexpr()); + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 20 + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp index 556326fb0894c..e456fa8986aad 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(fill); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp index 
4abbd6f7a17c3..51232dfef1606 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(fill_n); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp index 1afaa1a7e6da1..e28484ee4984b 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp @@ -25,6 +25,7 @@ #include "MoveOnly.h" #include "test_iterators.h" #include "test_macros.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -119,7 +120,6 @@ TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::cpp17_input_iterator_list(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) types::for_each(types::cpp17_input_iterator_list*>(), Test1OutIters()); - { // Make sure that padding bits aren't copied Derived src(1, 2, 3); Derived dst(4, 5, 6); @@ -128,16 +128,13 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::move(a + 3, a + 10, a); int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; assert(std::equal(a, a + 10, expected)); } - - // Make sure that the algorithm works with move-only types - { + { // Make sure that the algorithm works with move-only types // When non-trivial { MoveOnly from[3] = {1, 2, 3}; diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp index 3c0fcadb2d036..d8b7e68b155d6 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp @@ -24,6 +24,7 @@ #include "MoveOnly.h" #include "test_iterators.h" #include "test_macros.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -115,7 +116,6 @@ TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::bidirectional_iterator_list(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) types::for_each(types::bidirectional_iterator_list*>(), Test1OutIters()); - { // Make sure that padding bits aren't copied Derived src(1, 2, 3); Derived dst(4, 5, 6); @@ -125,16 +125,13 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::move_backward(a, a + 7, a + 10); int expected[] = {1, 2, 3, 1, 2, 3, 4, 5, 6, 7}; assert(std::equal(a, a + 10, expected)); } - - // Make sure that the algorithm works with move-only types - { + { // Make sure that the algorithm works with move-only types // When non-trivial { MoveOnly from[3] = {1, 2, 3}; diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp index e4cc5649ce5d8..a82a068caf031 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp @@ -23,6 
+23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(move); From 5450954a06425c6e50261d9c64778706a36f2cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Horv=C3=A1th?= Date: Wed, 19 Feb 2025 17:45:05 +0000 Subject: [PATCH 093/220] Include test folder in the Clang Static Analyzer team mentions (#127810) See https://discourse.llvm.org/t/taking-ownership-of-clang-test-analysis/84689 --- .github/new-prs-labeler.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 9863ff087ca86..c375fa5dc7516 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -499,6 +499,7 @@ clang:static analyzer: - clang/tools/scan-build/** - clang/utils/analyzer/** - clang/docs/analyzer/** + - clang/test/Analysis/** pgo: - llvm/lib/Transforms/Instrumentation/CGProfile.cpp From a96444af440a309592fe1043885a51ac0a5fb125 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 19 Feb 2025 18:48:19 +0100 Subject: [PATCH 094/220] [VPlan] Remove dead exit block handling code in HCFGBuilder. The mapping of IR ExitBB to a VPBB isn't used. It also sets an incorrect VPBB for the ExitBB; the regions successor is the middle block, no the exit block. It also unnecessarily triggers an assertion after 38376dee922. --- .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 18 ---- .../Transforms/LoopVectorize/loop-form.ll | 90 +++++++++++++++++++ 2 files changed, 90 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 22c2f91ff55f6..cf0ba6fa54700 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -366,24 +366,6 @@ void PlainCFGBuilder::buildPlainCFG( // latter. BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB; Loop2Region[LI->getLoopFor(TheLoop->getHeader())] = TheRegion; - BasicBlock *ExitBB = TheLoop->getUniqueExitBlock(); - if (!ExitBB) { - // If there is no unique exit block, we must exit via the latch. This exit - // is mapped to the middle block in the input plan. - BasicBlock *Latch = TheLoop->getLoopLatch(); - auto *Br = cast(Latch->getTerminator()); - if (TheLoop->contains(Br->getSuccessor(0))) { - assert(!TheLoop->contains(Br->getSuccessor(1)) && - "latch must exit the loop"); - ExitBB = Br->getSuccessor(1); - } else { - assert(!TheLoop->contains(Br->getSuccessor(0)) && - "latch must exit the loop"); - ExitBB = Br->getSuccessor(0); - } - } - assert(ExitBB && "Must have a unique exit block or also exit via the latch."); - BB2VPBB[ExitBB] = cast(TheRegion->getSingleSuccessor()); // The existing vector region's entry and exiting VPBBs correspond to the loop // header and latch. 
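For reference, the regression test added to loop-form.ll below (multiple_exit_none_via_latch) covers the case that violated the removed code's assumption: a multi-exit loop in which no exit is taken from the latch. A rough hand-written C++ equivalent of that test (a sketch for illustration only, not code from the patch) looks like this:

// Both exits leave the loop from the header and from the 'then' block,
// never from the latch; the latch only increments the induction variable.
short multiple_exit_none_via_latch(int *dst, long x) {
  for (long iv = 0;; ++iv) {
    dst[iv] = 0;
    if (iv >= 100)
      return 1; // corresponds to exit.2, taken from the loop header
    if (iv == x)
      return 0; // corresponds to exit.1, taken from loop.then
  }
}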
diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll index 730d488119d13..4a9380b3f35e8 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -1321,3 +1321,93 @@ exit: ret i32 %accum } +define i16 @multiple_exit_none_via_latch(ptr %dst, i64 %x) { +; CHECK-LABEL: @multiple_exit_none_via_latch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[X:%.*]], i64 100) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 2, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP4]] +; CHECK-NEXT: store i64 0, ptr [[TMP5]], align 8 +; CHECK-NEXT: store i64 0, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 +; CHECK-NEXT: [[CMP120:%.*]] = icmp slt i64 [[IV]], 100 +; CHECK-NEXT: br i1 [[CMP120]], label [[LOOP_THEN:%.*]], label [[EXIT_2:%.*]] +; CHECK: loop.then: +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[IV]], [[X]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: br label [[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: exit.1: +; CHECK-NEXT: ret i16 0 +; CHECK: exit.2: +; CHECK-NEXT: ret i16 1 +; +; TAILFOLD-LABEL: @multiple_exit_none_via_latch( +; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: br label [[LOOP_HEADER:%.*]] +; TAILFOLD: loop.header: +; TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: store i64 0, ptr [[GEP]], align 8 +; TAILFOLD-NEXT: [[CMP120:%.*]] = icmp slt i64 [[IV]], 100 +; TAILFOLD-NEXT: br i1 [[CMP120]], label [[LOOP_THEN:%.*]], label [[EXIT_2:%.*]] +; TAILFOLD: loop.then: +; TAILFOLD-NEXT: [[CMP3:%.*]] = icmp ne i64 [[IV]], [[X:%.*]] +; TAILFOLD-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]] +; TAILFOLD: loop.latch: +; TAILFOLD-NEXT: [[IV_NEXT]] = add i64 [[IV]], 
1 +; TAILFOLD-NEXT: br label [[LOOP_HEADER]] +; TAILFOLD: exit.1: +; TAILFOLD-NEXT: ret i16 0 +; TAILFOLD: exit.2: +; TAILFOLD-NEXT: ret i16 1 +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i64 0, ptr %gep + %cmp120 = icmp slt i64 %iv, 100 + br i1 %cmp120, label %loop.then, label %exit.2 + +loop.then: + %cmp3 = icmp ne i64 %iv, %x + br i1 %cmp3, label %loop.latch, label %exit.1 + +loop.latch: + %iv.next = add i64 %iv, 1 + br label %loop.header + +exit.1: + ret i16 0 + +exit.2: + ret i16 1 +} From d1889cf935db1c04da6d477a2476d95bae529160 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 19 Feb 2025 17:56:46 +0000 Subject: [PATCH 095/220] [X86] combineX86ShuffleChain - provide list of combined shuffle nodes, replace HasVariableMask bool arg. NFC. (#127826) Minor NFC refactor before making better variable mask combining decisions - isTargetShuffleVariableMask doesn't discriminate between fast (AND, PSHUFB etc.) and slow (VPERMV3 etc.) variable shuffles, so an opaque HasVariableMask is only of limited use. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 47 +++++++++++++------------ 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7485fc48f4132..d805a76754c71 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39580,7 +39580,7 @@ static bool matchBinaryPermuteShuffle( static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, bool AllowVariableCrossLaneMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget); @@ -39595,7 +39595,7 @@ static SDValue combineX86ShuffleChainWithExtract( /// instruction but should only be used to replace chains over a certain depth. static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, @@ -40064,6 +40064,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (Depth < 1) return SDValue(); + bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) { + return isTargetShuffleVariableMask(N->getOpcode()); + }); + // Depth threshold above which we can efficiently use variable mask shuffles. int VariableCrossLaneShuffleDepth = Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2; @@ -40134,9 +40138,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( - Inputs, Root, BaseMask, Depth, HasVariableMask, - AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, - Subtarget)) + Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask, + AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input lane-crossing shuffle then lower to VPERMV3, @@ -40307,8 +40310,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. 
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( - Inputs, Root, BaseMask, Depth, HasVariableMask, - AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) + Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask, + AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input shuffle then lower to VPERMV3, @@ -40346,7 +40349,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // extract_subvector(shuffle(x,y,m2),0) static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, bool AllowVariableCrossLaneMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumMaskElts = BaseMask.size(); @@ -40475,7 +40478,7 @@ static SDValue combineX86ShuffleChainWithExtract( if (SDValue WideShuffle = combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth, - HasVariableMask, AllowVariableCrossLaneMask, + SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) { WideShuffle = extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits); @@ -40698,7 +40701,7 @@ static SDValue canonicalizeShuffleMaskWithHorizOp( // TODO: Extend this to merge multiple constant Ops and update the mask. static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops, ArrayRef Mask, - bool HasVariableMask, + ArrayRef SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget) { unsigned SizeInBits = VT.getSizeInBits(); @@ -40720,6 +40723,9 @@ static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops, // only used once or the combined shuffle has included a variable mask // shuffle, this is to avoid constant pool bloat. bool IsOptimizingSize = DAG.shouldOptForSize(); + bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) { + return isTargetShuffleVariableMask(N->getOpcode()); + }); if (IsOptimizingSize && !HasVariableMask && llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); })) return SDValue(); @@ -40821,7 +40827,7 @@ namespace llvm { static SDValue combineX86ShufflesRecursively( ArrayRef SrcOps, int SrcOpIndex, SDValue Root, ArrayRef RootMask, ArrayRef SrcNodes, unsigned Depth, - unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, + unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(!RootMask.empty() && @@ -40877,7 +40883,6 @@ static SDValue combineX86ShufflesRecursively( SmallVector OpMask; SmallVector OpInputs; APInt OpUndef, OpZero; - bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, OpZero, DAG, Depth, false)) { // Shuffle inputs must not be larger than the shuffle result. @@ -41092,7 +41097,6 @@ static SDValue combineX86ShufflesRecursively( return getOnesVector(RootVT, DAG, DL); assert(!Ops.empty() && "Shuffle with no inputs detected"); - HasVariableMask |= IsOpVariableMask; // Update the list of shuffle nodes that have been combined so far. 
SmallVector CombinedNodes(SrcNodes); @@ -41121,15 +41125,14 @@ static SDValue combineX86ShufflesRecursively( } if (SDValue Res = combineX86ShufflesRecursively( Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth, - HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG, - Subtarget)) + AllowCrossLaneVar, AllowPerLaneVar, DAG, Subtarget)) return Res; } } // Attempt to constant fold all of the constant source ops. if (SDValue Cst = combineX86ShufflesConstants( - RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget)) + RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget)) return Cst; // If constant fold failed and we only have constants - then we have @@ -41231,7 +41234,7 @@ static SDValue combineX86ShufflesRecursively( // Try to combine into a single shuffle instruction. if (SDValue Shuffle = combineX86ShuffleChain( - Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask, + Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) return Shuffle; @@ -41250,7 +41253,7 @@ static SDValue combineX86ShufflesRecursively( // If that failed and any input is extracted then try to combine as a // shuffle with the larger type. return combineX86ShuffleChainWithExtract( - Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask, + Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget); } @@ -41259,7 +41262,6 @@ static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { return combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG, Subtarget); } @@ -41897,7 +41899,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, + /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -42236,7 +42238,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask); if (SDValue NewMask = combineX86ShufflesConstants( ShufVT, {MaskLHS, MaskRHS}, ByteMask, - /*HasVariableMask=*/true, DAG, DL, Subtarget)) { + {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) { SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, LHS.getOperand(0), NewMask); SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, @@ -43871,7 +43873,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue NewShuffle = combineX86ShufflesRecursively( {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG, Subtarget); if (NewShuffle) @@ -51430,7 +51431,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true, + /*AllowVarCrossLaneMask*/ true, /*AllowVarPerLaneMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle, N0.getOperand(1)); From ccd3defd8f7da2e825f167e488827efa0df6b62c Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: 
Wed, 19 Feb 2025 12:57:38 -0500 Subject: [PATCH 096/220] [clangd] Avoid round-trip from SourceLocation to clangd::Range and back in SymbolCollector::handleMacros() (#127757) --- clang-tools-extra/clangd/CollectMacros.cpp | 9 ++++++--- clang-tools-extra/clangd/CollectMacros.h | 1 + clang-tools-extra/clangd/index/SymbolCollector.cpp | 7 +++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/clang-tools-extra/clangd/CollectMacros.cpp b/clang-tools-extra/clangd/CollectMacros.cpp index 96298ee3ea50a..1e7d765f0b6f1 100644 --- a/clang-tools-extra/clangd/CollectMacros.cpp +++ b/clang-tools-extra/clangd/CollectMacros.cpp @@ -18,10 +18,13 @@ namespace clang { namespace clangd { -Range MacroOccurrence::toRange(const SourceManager &SM) const { +CharSourceRange MacroOccurrence::toSourceRange(const SourceManager &SM) const { auto MainFile = SM.getMainFileID(); - return halfOpenToRange( - SM, syntax::FileRange(MainFile, StartOffset, EndOffset).toCharRange(SM)); + return syntax::FileRange(MainFile, StartOffset, EndOffset).toCharRange(SM); +} + +Range MacroOccurrence::toRange(const SourceManager &SM) const { + return halfOpenToRange(SM, toSourceRange(SM)); } void CollectMainFileMacros::add(const Token &MacroNameTok, const MacroInfo *MI, diff --git a/clang-tools-extra/clangd/CollectMacros.h b/clang-tools-extra/clangd/CollectMacros.h index e7198641d8d53..20a3fc24d759c 100644 --- a/clang-tools-extra/clangd/CollectMacros.h +++ b/clang-tools-extra/clangd/CollectMacros.h @@ -31,6 +31,7 @@ struct MacroOccurrence { // True if the occurence is used in a conditional directive, e.g. #ifdef MACRO bool InConditionalDirective; + CharSourceRange toSourceRange(const SourceManager &SM) const; Range toRange(const SourceManager &SM) const; }; diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index 1de7faf81746e..3f5633357073d 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -713,7 +713,8 @@ void SymbolCollector::handleMacros(const MainFileMacros &MacroRefsToIndex) { // Add macro references. 
for (const auto &IDToRefs : MacroRefsToIndex.MacroRefs) { for (const auto &MacroRef : IDToRefs.second) { - const auto &Range = MacroRef.toRange(SM); + const auto &SR = MacroRef.toSourceRange(SM); + auto Range = halfOpenToRange(SM, SR); bool IsDefinition = MacroRef.IsDefinition; Ref R; R.Location.Start.setLine(Range.start.line); @@ -726,9 +727,7 @@ void SymbolCollector::handleMacros(const MainFileMacros &MacroRefsToIndex) { if (IsDefinition) { Symbol S; S.ID = IDToRefs.first; - auto StartLoc = cantFail(sourceLocationInMainFile(SM, Range.start)); - auto EndLoc = cantFail(sourceLocationInMainFile(SM, Range.end)); - S.Name = toSourceCode(SM, SourceRange(StartLoc, EndLoc)); + S.Name = toSourceCode(SM, SR.getAsRange()); S.SymInfo.Kind = index::SymbolKind::Macro; S.SymInfo.SubKind = index::SymbolSubKind::None; S.SymInfo.Properties = index::SymbolPropertySet(); From 1d829f6a0bc9fefc489d44449c6ae4c8e509a7a3 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 19 Feb 2025 18:00:44 +0000 Subject: [PATCH 097/220] [mlir][tosa] Align check variables naming to use capitals (#127830) Move in using capital letters for variable capture in LIT Signed-off-by: Georgios Pinitas --- .../Tosa/tosa-decompose-depthwise.mlir | 42 +++++++++---------- .../Tosa/tosa-decompose-transpose-conv.mlir | 4 +- mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 18 ++++---- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir index 6562a7c2ab55c..d0c0c8456d1ca 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir +++ b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir @@ -31,22 +31,22 @@ func.func @depthwise_conv2d_as_mul(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1 // CHECK-LABEL: @depthwise_conv2d_as_mul_q func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<1x1x2x3xi8>, %arg2: tensor<6xi32>) -> tensor<4x10x10x6xi32> { // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {value = dense<[4, 10, 10, 2, 1]> : tensor<5xindex> - // CHECK-DAG: %[[iZp:.+]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1x1x1xi32>} - // CHECK-DAG: %[[wZp:.+]] = "tosa.const"() <{value = dense<11> : tensor<1x1x1x1xi32>} + // CHECK-DAG: %[[INPUT_ZP:.+]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1x1x1xi32>} + // CHECK-DAG: %[[WEIGHT_ZP:.+]] = "tosa.const"() <{value = dense<11> : tensor<1x1x1x1xi32>} // CHECK-DAG: %[[CONST3:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 2, 3]> : tensor<5xindex> // CHECK-DAG: %[[CONST4:.+]] = tosa.const_shape {value = dense<[4, 10, 10, 6]> : tensor<4xindex> // CHECK-DAG: %[[CONST5:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 6]> : tensor<4xindex> // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> - // CHECK: %[[rIn:.+]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[cIn:.+]] = tosa.cast %[[rIn]] : (tensor<4x10x10x2x1xi8>) -> tensor<4x10x10x2x1xi32> - // CHECK: %[[cWe:.+]] = tosa.cast %arg1 : (tensor<1x1x2x3xi8>) -> tensor<1x1x2x3xi32> - // CHECK: %[[sIn:.+]] = tosa.sub %[[cIn]], %[[iZp]] - // CHECK: %[[sWe:.+]] = tosa.sub %[[cWe]], %[[wZp]] - // CHECK: %[[resWe:.+]] = tosa.reshape %[[sWe]], %[[CONST3]] - // CHECK: %[[mul:.+]] = tosa.mul %[[sIn]], %[[resWe]], %[[SHIFT]] - // CHECK: %[[reO:.+]] = tosa.reshape %[[mul]], %[[CONST4]] - // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2, %[[CONST5]] - // CHECK: %[[add:.+]] = tosa.add %[[reO]], %[[reArg2]] + // CHECK: %[[RESHAPE_I:.+]] = tosa.reshape %arg0, 
%[[CONST0]] + // CHECK: %[[CAST_I:.+]] = tosa.cast %[[RESHAPE_I]] : (tensor<4x10x10x2x1xi8>) -> tensor<4x10x10x2x1xi32> + // CHECK: %[[CAST_W:.+]] = tosa.cast %arg1 : (tensor<1x1x2x3xi8>) -> tensor<1x1x2x3xi32> + // CHECK: %[[SUB_I:.+]] = tosa.sub %[[CAST_I]], %[[INPUT_ZP]] + // CHECK: %[[SUB_W:.+]] = tosa.sub %[[CAST_W]], %[[WEIGHT_ZP]] + // CHECK: %[[RESHAPE_W:.+]] = tosa.reshape %[[SUB_W]], %[[CONST3]] + // CHECK: %[[MUL:.+]] = tosa.mul %[[SUB_I]], %[[RESHAPE_W]], %[[SHIFT]] + // CHECK: %[[RESHAPE_O:.+]] = tosa.reshape %[[MUL]], %[[CONST4]] + // CHECK: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[CONST5]] + // CHECK: %[[ADD:.+]] = tosa.add %[[RESHAPE_O]], %[[RESHAPE_ARG2]] %input_zp = "tosa.const"() {value = dense<7> : tensor<1xi8>} : () -> tensor<1xi8> %weight_zp = "tosa.const"() {value = dense<11> : tensor<1xi8>} : () -> tensor<1xi8> %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = i32, pad = array, stride = array, dilation = array } : (tensor<4x10x10x2xi8>, tensor<1x1x2x3xi8>, tensor<6xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<4x10x10x6xi32> @@ -58,19 +58,19 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor< // CHECK-LABEL: @depthwise_conv2d_as_mul_padded func.func @depthwise_conv2d_as_mul_padded(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x12x12x6xf32> { // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {value = dense<[4, 10, 10, 2, 1]> : tensor<5xindex>} - // CHECK-DAG: %[[pad:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xindex>} : () -> !tosa.shape<10> - // CHECK-DAG: %[[zero:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor} + // CHECK-DAG: %[[PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xindex>} : () -> !tosa.shape<10> + // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor} // CHECK-DAG: %[[CONST3:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 2, 3]> : tensor<5xindex>} // CHECK-DAG: %[[CONST4:.+]] = tosa.const_shape {value = dense<[4, 12, 12, 6]> : tensor<4xindex>} // CHECK-DAG: %[[CONST5:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 6]> : tensor<4xindex>} // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> - // CHECK: %[[reIn:.+]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, !tosa.shape<10>, tensor) -> tensor<4x12x12x2x1xf32> - // CHECK: %[[reArg1:.+]] = tosa.reshape %arg1, %[[CONST3]] - // CHECK: %[[mul:.+]] = tosa.mul %[[padded]], %[[reArg1]], %[[SHIFT]] - // CHECK: %[[reOut:.+]] = tosa.reshape %[[mul]], %[[CONST4]] - // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2, %[[CONST5]] - // CHECK: %[[add:.+]] = tosa.add %[[reOut]], %[[reArg2]] + // CHECK: %[[RESHAPE_I:.+]] = tosa.reshape %arg0, %[[CONST0]] + // CHECK: %[[PAD_I:.+]] = tosa.pad %[[RESHAPE_I]], %[[PAD]], %[[ZERO]] : (tensor<4x10x10x2x1xf32>, !tosa.shape<10>, tensor) -> tensor<4x12x12x2x1xf32> + // CHECK: %[[RESHAPE_ARG1:.+]] = tosa.reshape %arg1, %[[CONST3]] + // CHECK: %[[MUL:.+]] = tosa.mul %[[PAD_I]], %[[RESHAPE_ARG1]], %[[SHIFT]] + // CHECK: %[[RESHAPE_O:.+]] = tosa.reshape %[[MUL]], %[[CONST4]] + // CHECK: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[CONST5]] + // CHECK: %[[ADD:.+]] = tosa.add %[[RESHAPE_O]], %[[RESHAPE_ARG2]] %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array, stride = array, dilation = array} : 
(tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x12x12x6xf32> return %0 : tensor<4x12x12x6xf32> } diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir index bd18b7ea0fdff..ae7a8e90b4281 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir +++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir @@ -120,8 +120,8 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1 // CHECK-DAG: %[[CONV_NEW_SHAPE:.*]] = tosa.const_shape {value = dense<[2, 18, 16, 2, 3, 5]> : tensor<6xindex>} // CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]], %[[CONV_NEW_SHAPE]] // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]] - // CHECK-DAG: %[[TEANS_NEW_SHAPE:.+]] = tosa.const_shape {value = dense<[2, 36, 48, 5]> : tensor<4xindex>} - // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]], %[[TEANS_NEW_SHAPE]] + // CHECK-DAG: %[[TRANS_NEW_SHAPE:.+]] = tosa.const_shape {value = dense<[2, 36, 48, 5]> : tensor<4xindex>} + // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]], %[[TRANS_NEW_SHAPE]] // CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]], %[[START]], %[[SIZE]] // CHECK-DAG: %[[ARG2_NEW_SHAPE:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 5]> : tensor<4xindex>} // CHECK-DAG: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[ARG2_NEW_SHAPE]] diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir index fa590ab495ada..7e714d0f8547a 100644 --- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir @@ -501,9 +501,9 @@ func.func @test_slice(%arg0 : tensor) -> () { // CHECK-LABEL: @test_slice_size_minus_one func.func @test_slice_size_minus_one(%arg0 : tensor) -> () { - // CHECK: %[[Start:.+]] = tosa.const_shape - // CHECK: %[[Size:.+]] = tosa.const_shape - // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[Start]], %[[Size]] : (tensor, !tosa.shape<4>, !tosa.shape<4>) -> tensor + // CHECK: %[[START:.+]] = tosa.const_shape + // CHECK: %[[SIZE:.+]] = tosa.const_shape + // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[START]], %[[SIZE]] : (tensor, !tosa.shape<4>, !tosa.shape<4>) -> tensor // this checks following // dim 0: size=-1, input dim=? => inferred output dim is ? // dim 1: size=-1 => inferred output dim is input_dim - start @@ -519,9 +519,9 @@ func.func @test_slice_size_minus_one(%arg0 : tensor) -> () { // CHECK-LABEL: @test_slice_size_out_of_bound func.func @test_slice_size_out_of_bound(%arg0 : tensor<8x8x8x?xi32>) -> () { - // CHECK: %[[Start:.+]] = tosa.const_shape - // CHECK: %[[Size:.+]] = tosa.const_shape - // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[Start]], %[[Size]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor + // CHECK: %[[START:.+]] = tosa.const_shape + // CHECK: %[[SIZE:.+]] = tosa.const_shape + // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[START]], %[[SIZE]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor // this checks following // dim 0: size=0 => inferred output dim is ? // dim 1: size=-2 => inferred output dim is ? 
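The "dim N:" comments in these slice tests describe the shape-inference rule informally: a size of -1 means "slice to the end of the dimension", while zero/negative sizes and out-of-bound starts or ranges leave the output dimension dynamic. A minimal C++ sketch of that rule (hypothetical, for illustration only; not the actual TosaOps shape-inference code) could look like:

#include <cstdint>
#include <optional>

// std::nullopt stands for a dynamic ('?') output dimension.
std::optional<int64_t> inferSliceDim(std::optional<int64_t> inputDim,
                                     int64_t start, int64_t size) {
  if (size == -1) {
    // Slice from 'start' to the end: only known when the input extent is.
    if (!inputDim)
      return std::nullopt;
    return *inputDim - start;
  }
  // Zero or negative sizes and out-of-bound starts/ranges are not rejected
  // here; the inferred output dimension simply stays dynamic.
  if (size <= 0 || start < 0 || (inputDim && start + size > *inputDim))
    return std::nullopt;
  return size;
}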
@@ -537,9 +537,9 @@ func.func @test_slice_size_out_of_bound(%arg0 : tensor<8x8x8x?xi32>) -> () { // CHECK-LABEL: @test_slice_start_out_of_bound func.func @test_slice_start_out_of_bound(%arg0 : tensor<8x8x8x?xi32>) -> () { - // CHECK: %[[Start:.+]] = tosa.const_shape - // CHECK: %[[Size:.+]] = tosa.const_shape - // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[Start]], %[[Size]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor + // CHECK: %[[START:.+]] = tosa.const_shape + // CHECK: %[[SIZE:.+]] = tosa.const_shape + // CHECK: %[[VAL:.+]] = tosa.slice %arg0, %[[START]], %[[SIZE]] : (tensor<8x8x8x?xi32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor // this checks following // dim 0: start=-1 => inferred output dim is ? // dim 1: start=8 => inferred output dim is ? From 9ebb618d03cb29c37e3178428dcf52e1ac4f1cc2 Mon Sep 17 00:00:00 2001 From: foxtran <39676482+foxtran@users.noreply.github.com> Date: Wed, 19 Feb 2025 19:05:44 +0100 Subject: [PATCH 098/220] [Clang] [Sema] Combine fallout warnings to just one warning (#127546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This merges several falloff and noreturn-related warnings and removes unused diagnostic arguments. Changes: - `warn_maybe_falloff_nonvoid_function` and `warn_falloff_nonvoid_function`, `warn_maybe_falloff_nonvoid_coroutine` and `warn_falloff_nonvoid_coroutine`, `warn_maybe_falloff_nonvoid_lambda` and `warn_falloff_nonvoid_lambda` were combined into `warn_falloff_nonvoid`, - `err_maybe_falloff_nonvoid_block` and `err_falloff_nonvoid_block` were combined into `err_falloff_nonvoid` - `err_noreturn_block_has_return_expr` and `err_noreturn_lambda_has_return_expr` were merged into `err_noreturn_has_return_expr` with the same semantics as `warn_falloff_nonvoid` or `err_falloff_nonvoid`. - Removed some diagnostic args that weren’t being used by the diagnostics. --- .../clang/Basic/DiagnosticSemaKinds.td | 41 ++--- clang/lib/Sema/AnalysisBasedWarnings.cpp | 144 ++++++------------ clang/lib/Sema/SemaStmt.cpp | 6 +- 3 files changed, 66 insertions(+), 125 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index ee1ad214d81df..feef50812eca9 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -713,13 +713,14 @@ def err_thread_non_global : Error< def err_thread_unsupported : Error< "thread-local storage is not supported for the current target">; -// FIXME: Combine fallout warnings to just one warning. 
-def warn_maybe_falloff_nonvoid_function : Warning< - "non-void function does not return a value in all control paths">, - InGroup; -def warn_falloff_nonvoid_function : Warning< - "non-void function does not return a value">, +def warn_falloff_nonvoid : Warning< + "non-void " + "%enum_select{%Function{function}|%Block{block}|%Lambda{lambda}|%Coroutine{coroutine}}0" + " does not return a value%select{| in all control paths}1">, InGroup; +def err_falloff_nonvoid : Error< + "non-void %select{function|block|lambda|coroutine}0 " + "does not return a value%select{| in all control paths}1">; def warn_const_attr_with_pure_attr : Warning< "'const' attribute imposes more restrictions; 'pure' attribute ignored">, InGroup; @@ -727,16 +728,6 @@ def warn_pure_function_returns_void : Warning< "'%select{pure|const}0' attribute on function returning 'void'; attribute ignored">, InGroup; -def err_maybe_falloff_nonvoid_block : Error< - "non-void block does not return a value in all control paths">; -def err_falloff_nonvoid_block : Error< - "non-void block does not return a value">; -def warn_maybe_falloff_nonvoid_coroutine : Warning< - "non-void coroutine does not return a value in all control paths">, - InGroup; -def warn_falloff_nonvoid_coroutine : Warning< - "non-void coroutine does not return a value">, - InGroup; def warn_suggest_noreturn_function : Warning< "%select{function|method}0 %1 could be declared with attribute 'noreturn'">, InGroup, DefaultIgnore; @@ -8406,14 +8397,6 @@ let CategoryName = "Lambda Issue" in { "lambda expression in default argument cannot capture any entity">; def err_lambda_incomplete_result : Error< "incomplete result type %0 in lambda expression">; - def err_noreturn_lambda_has_return_expr : Error< - "lambda declared 'noreturn' should not return">; - def warn_maybe_falloff_nonvoid_lambda : Warning< - "non-void lambda does not return a value in all control paths">, - InGroup; - def warn_falloff_nonvoid_lambda : Warning< - "non-void lambda does not return a value">, - InGroup; def err_access_lambda_capture : Error< // The ERRORs represent other special members that aren't constructors, in // hopes that someone will bother noticing and reporting if they appear @@ -10603,14 +10586,16 @@ def err_ctor_dtor_returns_void : Error< def warn_noreturn_function_has_return_expr : Warning< "function %0 declared 'noreturn' should not return">, InGroup; -def warn_falloff_noreturn_function : Warning< - "function declared 'noreturn' should not return">, +def warn_noreturn_has_return_expr : Warning< + "%select{function|block|lambda|coroutine}0 " + "declared 'noreturn' should not return">, InGroup; +def err_noreturn_has_return_expr : Error< + "%select{function|block|lambda|coroutine}0 " + "declared 'noreturn' should not return">; def warn_noreturn_coroutine : Warning< "coroutine %0 cannot be declared 'noreturn' as it always returns a coroutine handle">, InGroup; -def err_noreturn_block_has_return_expr : Error< - "block declared 'noreturn' should not return">; def err_carries_dependency_missing_on_first_decl : Error< "%select{function|parameter}0 declared '[[carries_dependency]]' " "after its first declaration">; diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index ce7d9be8d2faa..f21e571e6e0ce 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -544,25 +544,17 @@ static ControlFlowKind CheckFallThrough(AnalysisDeclContext &AC) { namespace { struct CheckFallThroughDiagnostics { - unsigned 
diag_MaybeFallThrough_HasNoReturn; - unsigned diag_MaybeFallThrough_ReturnsNonVoid; - unsigned diag_AlwaysFallThrough_HasNoReturn; - unsigned diag_AlwaysFallThrough_ReturnsNonVoid; - unsigned diag_NeverFallThroughOrReturn; - enum { Function, Block, Lambda, Coroutine } funMode; + unsigned diag_FallThrough_HasNoReturn = 0; + unsigned diag_FallThrough_ReturnsNonVoid = 0; + unsigned diag_NeverFallThroughOrReturn = 0; + unsigned FunKind; // TODO: use diag::FalloffFunctionKind SourceLocation FuncLoc; static CheckFallThroughDiagnostics MakeForFunction(const Decl *Func) { CheckFallThroughDiagnostics D; D.FuncLoc = Func->getLocation(); - D.diag_MaybeFallThrough_HasNoReturn = - diag::warn_falloff_noreturn_function; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::warn_maybe_falloff_nonvoid_function; - D.diag_AlwaysFallThrough_HasNoReturn = - diag::warn_falloff_noreturn_function; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::warn_falloff_nonvoid_function; + D.diag_FallThrough_HasNoReturn = diag::warn_noreturn_has_return_expr; + D.diag_FallThrough_ReturnsNonVoid = diag::warn_falloff_nonvoid; // Don't suggest that virtual functions be marked "noreturn", since they // might be overridden by non-noreturn functions. @@ -576,76 +568,49 @@ struct CheckFallThroughDiagnostics { isTemplateInstantiation = Function->isTemplateInstantiation(); if (!isVirtualMethod && !isTemplateInstantiation) - D.diag_NeverFallThroughOrReturn = - diag::warn_suggest_noreturn_function; - else - D.diag_NeverFallThroughOrReturn = 0; + D.diag_NeverFallThroughOrReturn = diag::warn_suggest_noreturn_function; - D.funMode = Function; + D.FunKind = diag::FalloffFunctionKind::Function; return D; } static CheckFallThroughDiagnostics MakeForCoroutine(const Decl *Func) { CheckFallThroughDiagnostics D; D.FuncLoc = Func->getLocation(); - D.diag_MaybeFallThrough_HasNoReturn = 0; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::warn_maybe_falloff_nonvoid_coroutine; - D.diag_AlwaysFallThrough_HasNoReturn = 0; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::warn_falloff_nonvoid_coroutine; - D.diag_NeverFallThroughOrReturn = 0; - D.funMode = Coroutine; + D.diag_FallThrough_ReturnsNonVoid = diag::warn_falloff_nonvoid; + D.FunKind = diag::FalloffFunctionKind::Coroutine; return D; } static CheckFallThroughDiagnostics MakeForBlock() { CheckFallThroughDiagnostics D; - D.diag_MaybeFallThrough_HasNoReturn = - diag::err_noreturn_block_has_return_expr; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::err_maybe_falloff_nonvoid_block; - D.diag_AlwaysFallThrough_HasNoReturn = - diag::err_noreturn_block_has_return_expr; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::err_falloff_nonvoid_block; - D.diag_NeverFallThroughOrReturn = 0; - D.funMode = Block; + D.diag_FallThrough_HasNoReturn = diag::err_noreturn_has_return_expr; + D.diag_FallThrough_ReturnsNonVoid = diag::err_falloff_nonvoid; + D.FunKind = diag::FalloffFunctionKind::Block; return D; } static CheckFallThroughDiagnostics MakeForLambda() { CheckFallThroughDiagnostics D; - D.diag_MaybeFallThrough_HasNoReturn = - diag::err_noreturn_lambda_has_return_expr; - D.diag_MaybeFallThrough_ReturnsNonVoid = - diag::warn_maybe_falloff_nonvoid_lambda; - D.diag_AlwaysFallThrough_HasNoReturn = - diag::err_noreturn_lambda_has_return_expr; - D.diag_AlwaysFallThrough_ReturnsNonVoid = - diag::warn_falloff_nonvoid_lambda; - D.diag_NeverFallThroughOrReturn = 0; - D.funMode = Lambda; + D.diag_FallThrough_HasNoReturn = diag::err_noreturn_has_return_expr; + D.diag_FallThrough_ReturnsNonVoid = 
diag::warn_falloff_nonvoid; + D.FunKind = diag::FalloffFunctionKind::Lambda; return D; } bool checkDiagnostics(DiagnosticsEngine &D, bool ReturnsVoid, bool HasNoReturn) const { - if (funMode == Function) { + if (FunKind == diag::FalloffFunctionKind::Function) { return (ReturnsVoid || - D.isIgnored(diag::warn_maybe_falloff_nonvoid_function, - FuncLoc)) && + D.isIgnored(diag::warn_falloff_nonvoid, FuncLoc)) && (!HasNoReturn || - D.isIgnored(diag::warn_noreturn_function_has_return_expr, - FuncLoc)) && + D.isIgnored(diag::warn_noreturn_has_return_expr, FuncLoc)) && (!ReturnsVoid || D.isIgnored(diag::warn_suggest_noreturn_block, FuncLoc)); } - if (funMode == Coroutine) { + if (FunKind == diag::FalloffFunctionKind::Coroutine) { return (ReturnsVoid || - D.isIgnored(diag::warn_maybe_falloff_nonvoid_function, FuncLoc) || - D.isIgnored(diag::warn_maybe_falloff_nonvoid_coroutine, - FuncLoc)) && + D.isIgnored(diag::warn_falloff_nonvoid, FuncLoc)) && (!HasNoReturn); } // For blocks / lambdas. @@ -662,12 +627,10 @@ struct CheckFallThroughDiagnostics { static void CheckFallThroughForBody(Sema &S, const Decl *D, const Stmt *Body, QualType BlockType, const CheckFallThroughDiagnostics &CD, - AnalysisDeclContext &AC, - sema::FunctionScopeInfo *FSI) { + AnalysisDeclContext &AC) { bool ReturnsVoid = false; bool HasNoReturn = false; - bool IsCoroutine = FSI->isCoroutine(); if (const auto *FD = dyn_cast(D)) { if (const auto *CBody = dyn_cast(Body)) @@ -696,49 +659,40 @@ static void CheckFallThroughForBody(Sema &S, const Decl *D, const Stmt *Body, if (CD.checkDiagnostics(Diags, ReturnsVoid, HasNoReturn)) return; SourceLocation LBrace = Body->getBeginLoc(), RBrace = Body->getEndLoc(); - auto EmitDiag = [&](SourceLocation Loc, unsigned DiagID) { - if (IsCoroutine) { - if (DiagID != 0) - S.Diag(Loc, DiagID) << FSI->CoroutinePromise->getType(); - } else { - S.Diag(Loc, DiagID); - } - }; // cpu_dispatch functions permit empty function bodies for ICC compatibility. if (D->getAsFunction() && D->getAsFunction()->isCPUDispatchMultiVersion()) return; // Either in a function body compound statement, or a function-try-block. 
- switch (CheckFallThrough(AC)) { - case UnknownFallThrough: - break; + switch (int FallThroughType = CheckFallThrough(AC)) { + case UnknownFallThrough: + break; - case MaybeFallThrough: - if (HasNoReturn) - EmitDiag(RBrace, CD.diag_MaybeFallThrough_HasNoReturn); - else if (!ReturnsVoid) - EmitDiag(RBrace, CD.diag_MaybeFallThrough_ReturnsNonVoid); - break; - case AlwaysFallThrough: - if (HasNoReturn) - EmitDiag(RBrace, CD.diag_AlwaysFallThrough_HasNoReturn); - else if (!ReturnsVoid) - EmitDiag(RBrace, CD.diag_AlwaysFallThrough_ReturnsNonVoid); - break; - case NeverFallThroughOrReturn: - if (ReturnsVoid && !HasNoReturn && CD.diag_NeverFallThroughOrReturn) { - if (const FunctionDecl *FD = dyn_cast(D)) { - S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 0 << FD; - } else if (const ObjCMethodDecl *MD = dyn_cast(D)) { - S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 1 << MD; - } else { - S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn); - } + case MaybeFallThrough: + case AlwaysFallThrough: + if (HasNoReturn) { + if (CD.diag_FallThrough_HasNoReturn) + S.Diag(RBrace, CD.diag_FallThrough_HasNoReturn) << CD.FunKind; + } else if (!ReturnsVoid && CD.diag_FallThrough_ReturnsNonVoid) { + bool NotInAllControlPaths = FallThroughType == MaybeFallThrough; + S.Diag(RBrace, CD.diag_FallThrough_ReturnsNonVoid) + << CD.FunKind << NotInAllControlPaths; + } + break; + case NeverFallThroughOrReturn: + if (ReturnsVoid && !HasNoReturn && CD.diag_NeverFallThroughOrReturn) { + if (const FunctionDecl *FD = dyn_cast(D)) { + S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 0 << FD; + } else if (const ObjCMethodDecl *MD = dyn_cast(D)) { + S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn) << 1 << MD; + } else { + S.Diag(LBrace, CD.diag_NeverFallThroughOrReturn); } - break; - case NeverFallThrough: - break; + } + break; + case NeverFallThrough: + break; } } @@ -2765,7 +2719,7 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( : (fscope->isCoroutine() ? CheckFallThroughDiagnostics::MakeForCoroutine(D) : CheckFallThroughDiagnostics::MakeForFunction(D))); - CheckFallThroughForBody(S, D, Body, BlockType, CD, AC, fscope); + CheckFallThroughForBody(S, D, Body, BlockType, CD, AC); } // Warning: check for unreachable code diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 0394edb7889ba..d0b713f074c33 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3590,7 +3590,8 @@ StmtResult Sema::ActOnCapScopeReturnStmt(SourceLocation ReturnLoc, if (auto *CurBlock = dyn_cast(CurCap)) { if (CurBlock->FunctionType->castAs()->getNoReturnAttr()) { - Diag(ReturnLoc, diag::err_noreturn_block_has_return_expr); + Diag(ReturnLoc, diag::err_noreturn_has_return_expr) + << diag::FalloffFunctionKind::Block; return StmtError(); } } else if (auto *CurRegion = dyn_cast(CurCap)) { @@ -3601,7 +3602,8 @@ StmtResult Sema::ActOnCapScopeReturnStmt(SourceLocation ReturnLoc, if (CurLambda->CallOperator->getType() ->castAs() ->getNoReturnAttr()) { - Diag(ReturnLoc, diag::err_noreturn_lambda_has_return_expr); + Diag(ReturnLoc, diag::err_noreturn_has_return_expr) + << diag::FalloffFunctionKind::Lambda; return StmtError(); } } From 2bf473bd546e65f8fc2f0d5006b8c8ef07259e24 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 19 Feb 2025 10:17:07 -0800 Subject: [PATCH 099/220] [GlobalOpt] Don't query TTI on a llvm.memcpy declaration. 
(#127760) Querying TTI creates a Subtarget object, but an llvm.memcpy declaration doesn't have target-cpu and target-feature attributes like functions with definitions. This can cause a warning to be printed on RISC-V because the target-abi in the Module requires floating point, but the subtarget features don't enable floating point. So far we've only seen this in LTO when an -mcpu is not supplied for the TargetMachine. To fix this, get TTI for the calling function instead. Fixes the issue reported here https://github.com/llvm/llvm-project/issues/69780#issuecomment-2665273161 --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 9586fc97a39f7..1a2a27d22ae68 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2186,8 +2186,10 @@ static bool tryWidenGlobalArraysUsedByMemcpy( if (NumElementsToCopy != DZSize || DZSize != SZSize) continue; - unsigned NumBytesToPad = GetTTI(*F).getNumBytesToPadGlobalArray( - NumBytesToCopy, SourceDataArray->getType()); + unsigned NumBytesToPad = + GetTTI(*CI->getFunction()) + .getNumBytesToPadGlobalArray(NumBytesToCopy, + SourceDataArray->getType()); if (NumBytesToPad) { return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy, BytesToCopyOp, SourceDataArray); From a6f48ed01292d0007e19a2605cba1acd4ecd123a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 19 Feb 2025 10:19:40 -0800 Subject: [PATCH 100/220] [MC] Remove MCRegister::isStackSlot. (#127755) Stack slots should only be stored in Register. The only caller was Register::isStackSlot so just inline it there. --- llvm/include/llvm/CodeGen/Register.h | 5 +++-- llvm/include/llvm/MC/MCRegister.h | 8 -------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h index 8a0bf3dc71ad2..ad05368bea6a4 100644 --- a/llvm/include/llvm/CodeGen/Register.h +++ b/llvm/include/llvm/CodeGen/Register.h @@ -42,11 +42,12 @@ class Register { /// /// FIXME: remove in favor of member. static constexpr bool isStackSlot(unsigned Reg) { - return MCRegister::isStackSlot(Reg); + return MCRegister::FirstStackSlot <= Reg && + Reg < MCRegister::VirtualRegFlag; } /// Return true if this is a stack slot. - constexpr bool isStack() const { return MCRegister::isStackSlot(Reg); } + constexpr bool isStack() const { return isStackSlot(Reg); } /// Compute the frame index from a register value representing a stack slot. static int stackSlot2Index(Register Reg) { diff --git a/llvm/include/llvm/MC/MCRegister.h b/llvm/include/llvm/MC/MCRegister.h index 53005bb03c2ee..16d0709753b35 100644 --- a/llvm/include/llvm/MC/MCRegister.h +++ b/llvm/include/llvm/MC/MCRegister.h @@ -54,14 +54,6 @@ class MCRegister { static constexpr unsigned FirstStackSlot = 1u << 30; static constexpr unsigned VirtualRegFlag = 1u << 31; - /// This is the portion of the positive number space that is not a physical - /// register. StackSlot values do not exist in the MC layer, see - /// Register::isStackSlot() for the more information on them. - /// - static constexpr bool isStackSlot(unsigned Reg) { - return FirstStackSlot <= Reg && Reg < VirtualRegFlag; - } - /// Return true if the specified register number is in /// the physical register namespace. 
static constexpr bool isPhysicalRegister(unsigned Reg) { From d57479cfbe9a6b4dccedfd1221c04973ad90ec97 Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Wed, 19 Feb 2025 10:19:57 -0800 Subject: [PATCH 101/220] [mlir][tosa] Update SelectOp's input names to match TOSA specification (#127833) Updated: - pred to input1 - on_true to input2 - on_false to input3 Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 8 ++++---- mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 14 +++++++------- .../Tosa/Transforms/TosaMakeBroadcastable.cpp | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 4d5837ca26c91..7cdf79f4dc59d 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1190,9 +1190,9 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> { }]; let arguments = (ins - Tosa_I1Tensor:$pred, - Tosa_Tensor:$on_true, - Tosa_Tensor:$on_false + Tosa_I1Tensor:$input1, + Tosa_Tensor:$input2, + Tosa_Tensor:$input3 ); let results = (outs @@ -1202,7 +1202,7 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> { let hasFolder = 1; let assemblyFormat = [{ - operands attr-dict `:` `(` type($pred) `,` type($on_true) `,` type($on_false) + operands attr-dict `:` `(` type($input1) `,` type($input2) `,` type($input3) `)` `->` type($output) }]; } diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index b9bcedb7fe71d..9bfc2aae1d6a5 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -65,12 +65,12 @@ void ConcatOp::getCanonicalizationPatterns(RewritePatternSet &results, } LogicalResult SelectOp::canonicalize(SelectOp op, PatternRewriter &rewriter) { - auto notOp = op.getPred().getDefiningOp(); + auto notOp = op.getInput1().getDefiningOp(); if (!notOp) return failure(); rewriter.modifyOpInPlace(op, [&]() { op.getOperation()->setOperands( - {notOp.getInput1(), op.getOnFalse(), op.getOnTrue()}); + {notOp.getInput1(), op.getInput3(), op.getInput2()}); }); return success(); } @@ -1131,18 +1131,18 @@ OpFoldResult SliceOp::fold(FoldAdaptor adaptor) { } OpFoldResult tosa::SelectOp::fold(FoldAdaptor adaptor) { - if (getOnTrue() == getOnFalse()) - return getOnTrue(); + if (getInput2() == getInput3()) + return getInput2(); auto predicate = - llvm::dyn_cast_if_present(adaptor.getPred()); + llvm::dyn_cast_if_present(adaptor.getInput1()); if (!predicate) return {}; if (!predicate.isSplat()) return {}; - return predicate.getSplatValue().getBoolValue() ? getOnTrue() - : getOnFalse(); + return predicate.getSplatValue().getBoolValue() ? 
getInput2() + : getInput3(); } OpFoldResult TileOp::fold(FoldAdaptor adaptor) { diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp index 79afc75fd6c8e..87b2a2695351b 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp @@ -169,9 +169,9 @@ struct ConvertTosaOp : public OpRewritePattern { LogicalResult matchAndRewrite(tosa::SelectOp tosaOp, PatternRewriter &rewriter) const override { - Value input1 = tosaOp.getPred(); - Value input2 = tosaOp.getOnTrue(); - Value input3 = tosaOp.getOnFalse(); + Value input1 = tosaOp.getInput1(); + Value input2 = tosaOp.getInput2(); + Value input3 = tosaOp.getInput3(); Value output = tosaOp.getResult(); auto outputType = dyn_cast(output.getType()); From 1841bcd5d16310c052c424dec3bcf2b703badd40 Mon Sep 17 00:00:00 2001 From: David Goldman Date: Wed, 19 Feb 2025 13:21:41 -0500 Subject: [PATCH 102/220] [clangd] Update XRefs to support overridden ObjC methods (#127109) - Support finding implementors of a protocol and discovering subclasses for ObjC interfaces via the implementations call - Support jumping to the overridden method when you trigger goto definition on an override - Properly find references to overridden methods --- clang-tools-extra/clangd/XRefs.cpp | 56 +++++++ .../clangd/unittests/SymbolCollectorTests.cpp | 36 ++++ .../clangd/unittests/XRefsTests.cpp | 155 ++++++++++++++++++ 3 files changed, 247 insertions(+) diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index 1a23f6cca7756..8b9fffa3f64cd 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -372,6 +372,15 @@ void enhanceLocatedSymbolsFromIndex(llvm::MutableArrayRef Result, }); } +bool objcMethodIsTouched(const SourceManager &SM, const ObjCMethodDecl *OMD, + SourceLocation Loc) { + unsigned NumSels = OMD->getNumSelectorLocs(); + for (unsigned I = 0; I < NumSels; ++I) + if (SM.getSpellingLoc(OMD->getSelectorLoc(I)) == Loc) + return true; + return false; +} + // Decls are more complicated. // The AST contains at least a declaration, maybe a definition. // These are up-to-date, and so generally preferred over index results. @@ -430,6 +439,26 @@ locateASTReferent(SourceLocation CurLoc, const syntax::Token *TouchedIdentifier, continue; } } + // Special case: - (void)^method {} should jump to overrides, but the decl + // shouldn't, only the definition. Note that an Objective-C method can + // override a parent class or protocol. + // + // FIXME: Support jumping from a protocol decl to overrides on go-to + // definition. + if (const auto *OMD = llvm::dyn_cast(D)) { + if (OMD->isThisDeclarationADefinition() && TouchedIdentifier && + objcMethodIsTouched(SM, OMD, TouchedIdentifier->location())) { + llvm::SmallVector Overrides; + OMD->getOverriddenMethods(Overrides); + if (!Overrides.empty()) { + for (const auto *Override : Overrides) + AddResultDecl(Override); + LocateASTReferentMetric.record(1, "objc-overriden-method"); + } + AddResultDecl(OMD); + continue; + } + } // Special case: the cursor is on an alias, prefer other results. // This targets "using ns::^Foo", where the target is more interesting. 
@@ -1283,6 +1312,12 @@ std::vector findImplementations(ParsedAST &AST, Position Pos, } else if (const auto *RD = dyn_cast(ND)) { IDs.insert(getSymbolID(RD)); QueryKind = RelationKind::BaseOf; + } else if (const auto *OMD = dyn_cast(ND)) { + IDs.insert(getSymbolID(OMD)); + QueryKind = RelationKind::OverriddenBy; + } else if (const auto *ID = dyn_cast(ND)) { + IDs.insert(getSymbolID(ID)); + QueryKind = RelationKind::BaseOf; } } return findImplementors(std::move(IDs), QueryKind, Index, AST.tuPath()); @@ -1302,6 +1337,21 @@ void getOverriddenMethods(const CXXMethodDecl *CMD, } } +// Recursively finds all the overridden methods of `OMD` in complete type +// hierarchy. +void getOverriddenMethods(const ObjCMethodDecl *OMD, + llvm::DenseSet &OverriddenMethods) { + if (!OMD) + return; + llvm::SmallVector Overrides; + OMD->getOverriddenMethods(Overrides); + for (const ObjCMethodDecl *Base : Overrides) { + if (auto ID = getSymbolID(Base)) + OverriddenMethods.insert(ID); + getOverriddenMethods(Base, OverriddenMethods); + } +} + std::optional stringifyContainerForMainFileRef(const Decl *Container) { // FIXME We might also want to display the signature here @@ -1438,6 +1488,12 @@ ReferencesResult findReferences(ParsedAST &AST, Position Pos, uint32_t Limit, getOverriddenMethods(CMD, OverriddenMethods); } } + // Special case: Objective-C methods can override a parent class or + // protocol, we should be sure to report references to those. + if (const auto *OMD = llvm::dyn_cast(ND)) { + OverriddenBy.Subjects.insert(getSymbolID(OMD)); + getOverriddenMethods(OMD, OverriddenMethods); + } } } diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp index 7a9703c744e93..1ce28c91a420c 100644 --- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp @@ -1335,6 +1335,42 @@ TEST_F(SymbolCollectorTest, OverrideRelationsMultipleInheritance) { OverriddenBy(CBar, DBar), OverriddenBy(CBaz, DBaz))); } +TEST_F(SymbolCollectorTest, ObjCOverrideRelationsSimpleInheritance) { + std::string Header = R"cpp( + @interface A + - (void)foo; + @end + @interface B : A + - (void)foo; // A::foo + - (void)bar; + @end + @interface C : B + - (void)bar; // B::bar + @end + @interface D : C + - (void)foo; // B::foo + - (void)bar; // C::bar + @end + )cpp"; + runSymbolCollector(Header, /*Main=*/"", + {"-xobjective-c++", "-Wno-objc-root-class"}); + const Symbol &AFoo = findSymbol(Symbols, "A::foo"); + const Symbol &BFoo = findSymbol(Symbols, "B::foo"); + const Symbol &DFoo = findSymbol(Symbols, "D::foo"); + + const Symbol &BBar = findSymbol(Symbols, "B::bar"); + const Symbol &CBar = findSymbol(Symbols, "C::bar"); + const Symbol &DBar = findSymbol(Symbols, "D::bar"); + + std::vector Result; + for (const Relation &R : Relations) + if (R.Predicate == RelationKind::OverriddenBy) + Result.push_back(R); + EXPECT_THAT(Result, UnorderedElementsAre( + OverriddenBy(AFoo, BFoo), OverriddenBy(BBar, CBar), + OverriddenBy(BFoo, DFoo), OverriddenBy(CBar, DBar))); +} + TEST_F(SymbolCollectorTest, CountReferences) { const std::string Header = R"( class W; diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 7d824d659ad2c..475b56b1dc230 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -411,6 +411,85 @@ TEST(LocateSymbol, FindOverrides) { sym("foo", Code.range("2"), 
std::nullopt))); } +TEST(LocateSymbol, FindOverridesFromDefObjC) { + auto Code = Annotations(R"objc( + @protocol Fooey + - (void)foo; + @end + @interface Base + - (void)foo; + @end + @interface Foo : Base + - (void)$1[[foo]]; + @end + + @interface Bar : Foo + - (void)$2[[foo]]; + @end + @implementation Bar + - (void)$3[[fo^o]] {} + @end + )objc"); + TestTU TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + EXPECT_THAT( + locateSymbolAt(AST, Code.point(), TU.index().get()), + UnorderedElementsAre(sym("foo", Code.range("1"), std::nullopt), + sym("foo", Code.range("2"), Code.range("3")))); +} + +TEST(LocateSymbol, NoOverridesFromDeclObjC) { + auto Code = Annotations(R"objc( + @protocol Fooey + - (void)foo; + @end + @interface Base + - (void)foo; + @end + @interface Foo : Base + - (void)foo; + @end + + @interface Bar : Foo + - (void)$2[[fo^o]]; + @end + @implementation Bar + - (void)$3[[foo]] {} + @end + )objc"); + TestTU TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + EXPECT_THAT( + locateSymbolAt(AST, Code.point(), TU.index().get()), + UnorderedElementsAre(sym("foo", Code.range("2"), Code.range("3")))); +} + +TEST(LocateSymbol, ObjCNoOverridesOnUsage) { + auto Code = Annotations(R"objc( + @interface Foo + - (void)foo; + @end + + @interface Bar : Foo + - (void)$1[[foo]]; + @end + @implementation Bar + - (void)$2[[foo]] {} + @end + void doSomething(Bar *bar) { + [bar fo^o]; + } + )objc"); + TestTU TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + EXPECT_THAT( + locateSymbolAt(AST, Code.point(), TU.index().get()), + UnorderedElementsAre(sym("foo", Code.range("1"), Code.range("2")))); +} + TEST(LocateSymbol, WithIndexPreferredLocation) { Annotations SymbolHeader(R"cpp( class $p[[Proto]] {}; @@ -1834,6 +1913,41 @@ TEST(FindImplementations, Inheritance) { } } +TEST(FindImplementations, InheritanceObjC) { + llvm::StringRef Test = R"objc( + @interface $base^Base + - (void)fo$foo^o; + @end + @protocol Protocol + - (void)$protocol^protocol; + @end + @interface $ChildDecl[[Child]] : Base + - (void)concrete; + - (void)$fooDecl[[foo]]; + @end + @implementation $ChildDef[[Child]] + - (void)concrete {} + - (void)$fooDef[[foo]] {} + - (void)$protocolDef[[protocol]] {} + @end + )objc"; + + Annotations Code(Test); + auto TU = TestTU::withCode(Code.code()); + TU.ExtraArgs.push_back("-xobjective-c++"); + auto AST = TU.build(); + auto Index = TU.index(); + EXPECT_THAT(findImplementations(AST, Code.point("base"), Index.get()), + UnorderedElementsAre(sym("Child", Code.range("ChildDecl"), + Code.range("ChildDef")))); + EXPECT_THAT(findImplementations(AST, Code.point("foo"), Index.get()), + UnorderedElementsAre( + sym("foo", Code.range("fooDecl"), Code.range("fooDef")))); + EXPECT_THAT(findImplementations(AST, Code.point("protocol"), Index.get()), + UnorderedElementsAre(sym("protocol", Code.range("protocolDef"), + Code.range("protocolDef")))); +} + TEST(FindImplementations, CaptureDefinition) { llvm::StringRef Test = R"cpp( struct Base { @@ -1963,6 +2077,7 @@ void checkFindRefs(llvm::StringRef Test, bool UseIndex = false) { Annotations T(Test); auto TU = TestTU::withCode(T.code()); TU.ExtraArgs.push_back("-std=c++20"); + TU.ExtraArgs.push_back("-xobjective-c++"); auto AST = TU.build(); std::vector> ExpectedLocations; @@ -2260,6 +2375,25 @@ TEST(FindReferences, IncludeOverrides) { checkFindRefs(Test, /*UseIndex=*/true); } +TEST(FindReferences, 
IncludeOverridesObjC) { + llvm::StringRef Test = + R"objc( + @interface Base + - (void)$decl(Base)[[f^unc]]; + @end + @interface Derived : Base + - (void)$overridedecl(Derived::func)[[func]]; + @end + @implementation Derived + - (void)$overridedef[[func]] {} + @end + void test(Derived *derived, Base *base) { + [derived func]; // No references to the overrides. + [base $(test)[[func]]]; + })objc"; + checkFindRefs(Test, /*UseIndex=*/true); +} + TEST(FindReferences, RefsToBaseMethod) { llvm::StringRef Test = R"cpp( @@ -2284,6 +2418,27 @@ TEST(FindReferences, RefsToBaseMethod) { checkFindRefs(Test, /*UseIndex=*/true); } +TEST(FindReferences, RefsToBaseMethodObjC) { + llvm::StringRef Test = + R"objc( + @interface BaseBase + - (void)$(BaseBase)[[func]]; + @end + @interface Base : BaseBase + - (void)$(Base)[[func]]; + @end + @interface Derived : Base + - (void)$decl(Derived)[[fu^nc]]; + @end + void test(BaseBase *bb, Base *b, Derived *d) { + // refs to overridden methods in complete type hierarchy are reported. + [bb $(test)[[func]]]; + [b $(test)[[func]]]; + [d $(test)[[fu^nc]]]; + })objc"; + checkFindRefs(Test, /*UseIndex=*/true); +} + TEST(FindReferences, MainFileReferencesOnly) { llvm::StringRef Test = R"cpp( From 8337d01e3058e7f47675f5b2b908b4e7821895d7 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 19 Feb 2025 19:27:42 +0100 Subject: [PATCH 103/220] [MLIR] Fix doc build (NFC) --- mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt index ac8c651cdced8..610170f5944eb 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_dialect(EmitC emitc) -add_mlir_doc(EmitC EmitC Dialects/ -gen-dialect-doc) +add_mlir_doc(EmitC EmitC Dialects/ -gen-dialect-doc -dialect emitc) set(LLVM_TARGET_DEFINITIONS EmitCAttributes.td) mlir_tablegen(EmitCEnums.h.inc -gen-enum-decls) From 36eaf0daf5d6dd665d7c7a9ec38ea22f27709fed Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Wed, 19 Feb 2025 11:16:43 -0800 Subject: [PATCH 104/220] AMDGPU: Don't canonicalize fminnum/fmaxnum if targets support IEEE fminimum(maximum)_num (#127711) For targets that support IEEE fminimum_num/fmaximum_num, the corresponding *_min_num_fXY/*_max_num_fXY instructions themselves already did the canonicalization for the inputs. As a result, we do not need to explicitly canonicalize the inputs for fminnum/fmaxnum. 
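As a rough illustration of the decision this change encodes (a minimal sketch only, not the in-tree LegalizerHelper/TargetLowering code): canonicalization of the operands is only inserted when the target's min/max instruction does not already quiet signaling NaNs and the operands are not known to be free of sNaNs. The names TargetInfo, Step and lowerFMinNum below are hypothetical stand-ins for illustration; only HasIEEEMinNumMaxNum corresponds to the new subtarget query added by this patch.

    #include <vector>

    // Hypothetical stand-ins for illustration; not LLVM API.
    struct TargetInfo {
      // Mirrors the new GCNSubtarget::hasIEEEMinNumMaxNum() query
      // (true for GFX12+, false for older targets).
      bool HasIEEEMinNumMaxNum;
    };

    enum class Step { CanonicalizeLHS, CanonicalizeRHS, EmitMinNumIEEE };

    // Returns the steps an fminnum lowering would take for one operand pair.
    std::vector<Step> lowerFMinNum(const TargetInfo &TI, bool KnownNeverSNaN) {
      std::vector<Step> Steps;
      // Quieting (canonicalizing) the inputs is only needed for correct sNaN
      // behaviour, and only when the hardware min instruction does not
      // already quiet signaling NaNs itself.
      if (!TI.HasIEEEMinNumMaxNum && !KnownNeverSNaN) {
        Steps.push_back(Step::CanonicalizeLHS);
        Steps.push_back(Step::CanonicalizeRHS);
      }
      Steps.push_back(Step::EmitMinNumIEEE);
      return Steps;
    }

Under this sketch, a GFX12-class target (HasIEEEMinNumMaxNum == true) emits only the min instruction, which is why the test diffs below drop the extra v_max_num_* canonicalization instructions.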
--- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 3 +- llvm/include/llvm/CodeGen/TargetLowering.h | 3 +- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 4 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 5 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 61 +-- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 61 +-- .../GlobalISel/clamp-fmed3-const-combine.ll | 4 - .../GlobalISel/fmed3-min-max-const-combine.ll | 8 - .../buffer-fat-pointer-atomicrmw-fmax.ll | 195 ++++---- .../buffer-fat-pointer-atomicrmw-fmin.ll | 195 ++++---- llvm/test/CodeGen/AMDGPU/clamp.ll | 9 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 454 ++++++++---------- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 454 ++++++++---------- llvm/test/CodeGen/AMDGPU/fmin3.ll | 30 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 318 +++++------- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 318 +++++------- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 72 ++- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 72 ++- llvm/test/CodeGen/AMDGPU/minmax.ll | 102 +--- 22 files changed, 954 insertions(+), 1424 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 4e18f5cc913a7..50eff989feda0 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -425,7 +425,8 @@ class LegalizerHelper { LegalizeResult lowerThreewayCompare(MachineInstr &MI); LegalizeResult lowerMinMax(MachineInstr &MI); LegalizeResult lowerFCopySign(MachineInstr &MI); - LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI); + LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI, + bool ShouldCanonicalize = true); LegalizeResult lowerFMad(MachineInstr &MI); LegalizeResult lowerIntrinsicRound(MachineInstr &MI); LegalizeResult lowerFFloor(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index a4c3d042fe3a4..7ec945d3a0108 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5314,7 +5314,8 @@ class TargetLowering : public TargetLoweringBase { SelectionDAG &DAG) const; /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. - SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; + SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG, + bool ShouldCanonicalize = true) const; /// Expand fminimum/fmaximum into multiple comparison with selects. SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d4cb224c35d74..319c4ac28c167 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -8137,14 +8137,14 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI) { } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) { +LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI, bool ShouldCanonicalize) { unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ? 
TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE; auto [Dst, Src0, Src1] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Dst); - if (!MI.getFlag(MachineInstr::FmNoNans)) { + if (ShouldCanonicalize && !MI.getFlag(MachineInstr::FmNoNans)) { // Insert canonicalizes if it's possible we need to quiet to get correct // sNaN behavior. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 7771958f5adc9..5804a42172a7b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8488,7 +8488,8 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node, } SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, - SelectionDAG &DAG) const { + SelectionDAG &DAG, + bool ShouldCanonicalize) const { if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG)) return Expanded; @@ -8505,7 +8506,7 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, SDValue Quiet0 = Node->getOperand(0); SDValue Quiet1 = Node->getOperand(1); - if (!Node->getFlags().hasNoNaNs()) { + if (ShouldCanonicalize && !Node->getFlags().hasNoNaNs()) { // Insert canonicalizes if it's possible we need to quiet to get correct // sNaN behavior. if (!DAG.isKnownNeverSNaN(Quiet0)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 649deee346e90..4ce8ffb39599b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2710,7 +2710,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, if (IsIEEEOp) return true; - return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; + return Helper.lowerFMinNumMaxNum(MI, !ST.hasIEEEMinNumMaxNum()) == + LegalizerHelper::Legalized; } bool AMDGPULegalizerInfo::legalizeExtractVectorElt( diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6664a70572ded..56162d18e039d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1428,6 +1428,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // \returns true if the target has IEEE fminimum/fmaximum instructions bool hasIEEEMinMax() const { return getGeneration() >= GFX12; } + // \returns true if the target has IEEE fminimum_num/fmaximum_num + // instructions + bool hasIEEEMinNumMaxNum() const { return getGeneration() >= GFX12; } + // \returns true if the target has IEEE fminimum3/fmaximum3 instructions bool hasIEEEMinMax3() const { return hasIEEEMinMax(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 909ad07782fc6..a60345bfedca9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6833,7 +6833,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, // mode functions, but this happens to be OK since it's only done in cases // where there is known no sNaN. 
if (IsIEEEMode) - return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); + return expandFMINNUM_FMAXNUM(Op.getNode(), DAG, + !Subtarget->hasIEEEMinNumMaxNum()); if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || VT == MVT::v16bf16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index d1a303b41deef..ed0a522f6c11d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -602,15 +602,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -757,21 +755,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], 
v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 
:: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index b8538cbf254fc..0d02c0d8cb464 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -602,15 +602,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -757,21 +755,18 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; 
GFX12-NEXT: s_wait_alu 0xfffe @@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen 
th:TH_ATOMIC_RETURN @@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll index c7676e9da6f49..0688b5e42cc4c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll @@ -60,8 +60,6 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmin = call float @llvm.minnum.f32(float %a, float 10.0) @@ -158,8 +156,6 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp ; GFX12-NEXT: s_setpc_b64 s[30:31] %fmin = call float @llvm.minnum.f32(float %a, float 10.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll index 75c4cd53e3bfc..b0b41c1c466e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll @@ -82,8 +82,6 @@ define half @test_min_K1max_ValK0_f16(half %a) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] 
%maxnum = call half @llvm.maxnum.f16(half %a, half 2.0) @@ -474,8 +472,6 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0) @@ -507,8 +503,6 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) @@ -541,8 +535,6 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %minnum = call float @llvm.minnum.f32(float %a, float 4.0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 38adf60888eca..6d8671c7f78a3 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1195,23 +1195,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1389,26 +1386,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; 
GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1598,17 +1593,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -2002,23 +1995,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: 
v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2289,23 +2279,20 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2487,46 +2474,44 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; 
GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2851,39 +2836,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3203,15 +3186,15 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3226,30 +3209,27 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 +; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v5 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3265,14 +3245,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3280,7 +3260,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5372,23 +5352,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v2 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5694,24 +5670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v1, v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6003,7 +5978,7 @@ 
define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6019,25 +5994,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v7, v8, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6053,14 +6026,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6068,7 +6041,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 2b8cea9068d87..0f40c2d563111 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1195,23 +1195,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; 
GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -1389,26 +1386,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1598,17 +1593,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[13:14], v[5:6] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -2002,23 +1995,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2289,23 +2279,20 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[9:10], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN @@ -2487,46 +2474,44 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2851,39 +2836,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-NEXT: s_not_b32 
s6, s5 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3203,15 +3186,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3226,30 +3209,27 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 +; GFX12-NEXT: v_min_num_f16_e32 v6, v6, v5 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3265,14 +3245,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3280,7 +3260,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5372,23 +5352,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_min_num_f16 
v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v4, v5, v2 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -5694,24 +5670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_min_num_f16 v1, v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6003,7 +5978,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6019,25 +5994,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_min_num_f16 v7, v8, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: 
Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6053,14 +6026,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6068,7 +6041,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 73ed23ab681f0..3a13eecd84781 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -413,12 +413,11 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -518,9 +517,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3157,9 +3154,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -3257,9 +3252,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: 
v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 6ead5b93a0e39..9a69c254b1000 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2770,29 +2770,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2801,7 +2799,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -2810,16 +2808,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3159,7 +3155,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo @@ -3187,9 +3182,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3209,13 +3203,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3572,7 +3564,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo @@ -3600,9 +3591,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3622,13 +3612,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: 
v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3986,7 +3974,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4002,20 +3989,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4024,19 +4009,17 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4371,15 +4354,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: 
s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4392,20 +4374,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4413,20 +4393,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4777,15 +4755,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4798,20 +4775,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4819,20 +4794,18 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5184,29 +5157,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5215,7 +5186,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5224,16 +5195,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5607,29 +5576,27 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: 
s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5638,7 +5605,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5647,16 +5614,14 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6001,9 +5966,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6017,12 +5981,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6305,11 +6268,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6321,12 +6283,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6618,11 +6579,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6634,12 +6594,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6930,9 +6889,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6945,12 +6903,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7222,37 +7178,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7526,37 +7479,34 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7832,17 +7782,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; 
GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8067,24 +8015,21 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8300,11 +8245,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8316,12 +8260,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -8613,38 +8556,35 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12279,15 +12219,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12514,15 +12452,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12752,15 +12688,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13007,21 +12941,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13233,21 +13164,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13466,21 +13394,18 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13718,15 +13643,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13958,22 +13881,19 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; 
GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 1fc9ed70e009c..383c1031330b9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2770,29 +2770,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2801,7 +2799,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: 
s_and_not1_saveexec_b32 s0, s0 @@ -2810,16 +2808,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3159,7 +3155,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo @@ -3187,9 +3182,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3209,13 +3203,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3572,7 +3564,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo @@ -3600,9 +3591,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3622,13 +3612,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] -; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3986,7 +3974,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4002,20 +3989,18 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4024,19 +4009,17 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 
s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4371,15 +4354,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4392,20 +4374,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4413,20 +4393,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; 
GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4777,15 +4755,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4798,20 +4775,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4819,20 +4794,18 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 
vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -5184,29 +5157,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5215,7 +5186,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5224,16 +5195,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: 
v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5607,29 +5576,27 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5638,7 +5605,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -5647,16 +5614,14 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -6001,9 +5966,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6017,12 +5981,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6305,11 +6268,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6321,12 +6283,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6618,11 +6579,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6634,12 +6594,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6930,9 +6889,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6945,12 +6903,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7222,37 +7178,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7526,37 +7479,34 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7832,17 +7782,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8067,24 +8015,21 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8300,11 +8245,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8316,12 +8260,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -8613,38 +8556,35 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 
v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12279,15 +12219,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12514,15 +12452,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12752,15 +12688,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -13007,21 +12941,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13233,21 +13164,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13466,21 +13394,18 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13718,15 +13643,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] 
offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13958,22 +13881,19 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 269fd52df5c49..31d3faf9ea83c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -887,18 +887,17 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s4, s6 +; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 @@ -1063,18 +1062,17 @@ define 
amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s4, s6 +; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s8, s0 -; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s9, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index e8d73914ad302..f3c08970ae2ca 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -2991,15 +2991,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3163,15 +3161,13 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3336,15 +3332,13 @@ define double 
@global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3508,21 +3502,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3670,21 +3661,18 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3835,21 +3823,18 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4001,15 +3986,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4249,15 +4232,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; 
GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4425,9 +4406,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4441,12 +4421,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4779,11 +4758,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4795,12 +4773,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5144,11 +5121,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5160,12 +5136,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5508,9 +5483,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5523,12 +5497,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5849,37 +5821,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: 
global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6203,37 +6172,34 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6559,17 +6525,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6833,24 +6797,21 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7103,11 +7064,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7119,12 +7079,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -7468,38 +7427,35 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -11675,15 +11631,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11967,15 +11921,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12261,15 +12213,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12558,21 +12508,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12839,21 +12786,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13123,21 +13067,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13416,15 +13357,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 
0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13712,22 +13651,19 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index c1c92906df250..eca7a101bb5a2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -2991,15 +2991,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3163,15 +3161,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], 
v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3336,15 +3332,13 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3508,21 +3502,18 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3670,21 +3661,18 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: 
s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3835,21 +3823,18 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4001,15 +3986,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4249,15 +4232,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4425,9 +4406,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4441,12 +4421,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4779,11 +4758,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4795,12 +4773,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5144,11 +5121,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5160,12 +5136,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5508,9 +5483,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5523,12 +5497,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5849,37 +5821,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; 
GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6203,37 +6172,34 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6559,17 +6525,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6833,24 +6797,21 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; 
GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7103,11 +7064,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7119,12 +7079,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -7468,38 +7427,35 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -11675,15 +11631,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11967,15 +11921,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12261,15 +12213,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12558,21 +12508,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12839,21 +12786,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13123,21 +13067,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13416,15 +13357,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13712,22 +13651,19 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 681c07db327dc..f398497e6b28f 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -784,29 +784,28 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-NEXT: ds_load_b32 v2, v1 +; 
GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 +; GFX12-NEXT: v_not_b32_e32 v3, v3 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_mov_b32_e32 v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -814,7 +813,7 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16: @@ -1108,12 +1107,11 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1428,12 +1426,11 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-NEXT: 
s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1730,12 +1727,10 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2034,10 +2029,9 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v1, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2276,11 +2270,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -4514,15 +4506,13 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4785,15 +4775,13 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5056,14 +5044,11 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5317,14 +5302,11 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index bf56496e98690..dc7953567450a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -784,29 +784,28 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-NEXT: ds_load_b32 v2, v1 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff ; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 +; GFX12-NEXT: v_not_b32_e32 v3, v3 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_mov_b32_e32 v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -814,7 +813,7 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16: @@ -1108,12 +1107,11 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1428,12 +1426,11 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1730,12 +1727,10 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 ; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2034,10 +2029,9 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v1, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2276,11 +2270,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -4514,15 +4506,13 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4785,15 +4775,13 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v2, v3, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5056,14 +5044,11 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5317,14 +5302,11 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 
offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 954dab3d0fc6f..177f98ddd0045 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -297,29 +297,15 @@ define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) { ; GISEL-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GISEL-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-LABEL: test_minmax_f32_ieee_true: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) %minmax = call float @llvm.minnum.f32(float %max, float %c) ret float %minmax @@ -401,29 +387,15 @@ define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) { ; GISEL-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; SDAG-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GISEL-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 -; GISEL-GFX12-NEXT: s_setpc_b64 
s[30:31] +; GFX12-LABEL: test_maxmin_f32_ieee_true: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) %maxmin = call float @llvm.maxnum.f32(float %min, float %c) ret float %maxmin @@ -638,11 +610,9 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l ; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: @@ -652,9 +622,6 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -665,10 +632,7 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l ; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true: @@ -678,9 +642,6 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) @@ -782,11 +743,9 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h -; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, 
v1.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l ; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: @@ -796,9 +755,6 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -809,10 +765,7 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l -; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l ; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true: @@ -822,9 +775,6 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { ; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 -; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) From 85e23fe9c71f97280c804a139c3d014092b30c7f Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Wed, 19 Feb 2025 11:21:04 -0800 Subject: [PATCH 105/220] [Modules] Add `clang/Lex/HLSLRootSignatureTokenKinds.def` to clang's `modulemap` (#127839) b41b86a907f653f79bab10d4c80b3a41d146c71b added a new textual header `clang/Lex/HLSLRootSignatureTokenKinds.def` but did not add it to `clang`'s module map. This causes build failure when building llvm with `-DLLVM_ENABLE_MODULES=ON`. This PR adds the new textual header to the module map and fixes the build break. Fixing rdar://145148093. 
--- clang/include/module.modulemap | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index fb8e445cb4b72..8489619832a47 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -135,7 +135,16 @@ module Clang_Frontend { module Clang_FrontendTool { requires cplusplus umbrella "clang/FrontendTool" module * { export * } } module Clang_Index { requires cplusplus umbrella "clang/Index" module * { export * } } -module Clang_Lex { requires cplusplus umbrella "clang/Lex" module * { export * } } + +module Clang_Lex { + requires cplusplus + umbrella "clang/Lex" + + textual header "clang/Lex/HLSLRootSignatureTokenKinds.def" + + module * { export * } +} + module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } } module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } } module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } } From 1c762c288ff4ff613cf26833ef55100fd0da0a34 Mon Sep 17 00:00:00 2001 From: Deric Cheung Date: Wed, 19 Feb 2025 14:22:46 -0500 Subject: [PATCH 106/220] [HLSL] Implement the 'and' HLSL function (#127098) Addresses #125604 - Implements `and` as an HLSL builtin function - The `and` HLSL builtin function gets lowered to the the LLVM `and` instruction --- clang/include/clang/Basic/Builtins.td | 6 ++ clang/lib/CodeGen/CGBuiltin.cpp | 5 ++ clang/lib/Headers/hlsl/hlsl_intrinsics.h | 22 +++++++ clang/lib/Sema/SemaHLSL.cpp | 14 ++++ clang/test/CodeGenHLSL/builtins/and.hlsl | 68 ++++++++++++++++++++ clang/test/SemaHLSL/BuiltIns/and-errors.hlsl | 23 +++++++ 6 files changed, 138 insertions(+) create mode 100644 clang/test/CodeGenHLSL/builtins/and.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/and-errors.hlsl diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 0e5df338dd2e5..0e8b0189540bd 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4771,6 +4771,12 @@ def HLSLAll : LangBuiltin<"HLSL_LANG"> { let Prototype = "bool(...)"; } +def HLSLAnd : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_and"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLAny : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_any"]; let Attributes = [NoThrow, Const]; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 4688381040be2..a73ba1ff138fb 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19506,6 +19506,11 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, CGM.getHLSLRuntime().getAllIntrinsic(), ArrayRef{Op0}, nullptr, "hlsl.all"); } + case Builtin::BI__builtin_hlsl_and: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + Value *Op1 = EmitScalarExpr(E->getArg(1)); + return Builder.CreateAnd(Op0, Op1, "hlsl.and"); + } case Builtin::BI__builtin_hlsl_any: { Value *Op0 = EmitScalarExpr(E->getArg(0)); return Builder.CreateIntrinsic( diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index d1f5fdff8b600..f03b620eee142 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -249,6 +249,28 @@ bool all(double3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_all) bool all(double4); +//===----------------------------------------------------------------------===// +// and 
builtins +//===----------------------------------------------------------------------===// + +/// \fn bool and(bool x, bool y) +/// \brief Logically ands two boolean vectors elementwise and produces a bool +/// vector output. + +// TODO: Clean up clang-format marker once we've resolved +// https://github.com/llvm/llvm-project/issues/127851 +// +// clang-format off +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool and(bool x, bool y); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool2 and(bool2 x, bool2 y); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool3 and(bool3 x, bool3 y); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_and) +bool4 and(bool4 x, bool4 y); +// clang-format on + //===----------------------------------------------------------------------===// // any builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 957c3a0888438..20275ded8a561 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -2245,6 +2245,20 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } + case Builtin::BI__builtin_hlsl_and: { + if (SemaRef.checkArgCount(TheCall, 2)) + return true; + if (CheckVectorElementCallArgs(&SemaRef, TheCall)) + return true; + if (CheckScalarOrVector(&SemaRef, TheCall, getASTContext().BoolTy, 0)) + return true; + + ExprResult A = TheCall->getArg(0); + QualType ArgTyA = A.get()->getType(); + // return type is the same as the input type + TheCall->setType(ArgTyA); + break; + } case Builtin::BI__builtin_hlsl_all: case Builtin::BI__builtin_hlsl_any: { if (SemaRef.checkArgCount(TheCall, 1)) diff --git a/clang/test/CodeGenHLSL/builtins/and.hlsl b/clang/test/CodeGenHLSL/builtins/and.hlsl new file mode 100644 index 0000000000000..b77889cd9ae70 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/and.hlsl @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -O1 -o - | FileCheck %s + +// CHECK-LABEL: define noundef i1 @_Z15test_and_scalarbb( +// CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and i1 [[X]], [[Y]] +// CHECK-NEXT: ret i1 [[HLSL_AND]] +// +bool test_and_scalar(bool x, bool y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_( +// CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <2 x i1> [[X]], [[Y]] +// CHECK-NEXT: ret <2 x i1> [[HLSL_AND]] +// +bool2 test_and_bool2(bool2 x, bool2 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_( +// CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <3 x i1> [[X]], [[Y]] +// CHECK-NEXT: ret <3 x i1> [[HLSL_AND]] +// +bool3 test_and_bool3(bool3 x, bool3 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_( +// CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <4 x i1> [[X]], [[Y]] +// CHECK-NEXT: ret <4 x i1> 
[[HLSL_AND]] +// +bool4 test_and_bool4(bool4 x, bool4 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <4 x i1> @_Z13test_and_int4Dv4_iS_( +// CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[X]], zeroinitializer +// CHECK-NEXT: [[TOBOOL1:%.*]] = icmp ne <4 x i32> [[Y]], zeroinitializer +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <4 x i1> [[TOBOOL]], [[TOBOOL1]] +// CHECK-NEXT: ret <4 x i1> [[HLSL_AND]] +// +bool4 test_and_int4(int4 x, int4 y) { + return and(x, y); +} + +// CHECK-LABEL: define noundef <4 x i1> @_Z15test_and_float4Dv4_fS_( +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TOBOOL:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[X]], zeroinitializer +// CHECK-NEXT: [[TOBOOL1:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[Y]], zeroinitializer +// CHECK-NEXT: [[HLSL_AND:%.*]] = and <4 x i1> [[TOBOOL]], [[TOBOOL1]] +// CHECK-NEXT: ret <4 x i1> [[HLSL_AND]] +// +bool4 test_and_float4(float4 x, float4 y) { + return and(x, y); +} diff --git a/clang/test/SemaHLSL/BuiltIns/and-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/and-errors.hlsl new file mode 100644 index 0000000000000..0a99feb023d73 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/and-errors.hlsl @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -O1 -verify + +bool test_too_few_arg(bool a) { + return __builtin_hlsl_and(a); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} +} + +bool test_too_many_arg(bool a) { + return __builtin_hlsl_and(a, a, a); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +bool2 test_mismatched_args(bool2 a, bool3 b) { + return __builtin_hlsl_and(a, b); + // expected-error@-1 {{all arguments to '__builtin_hlsl_and' must have the same type}} +} + +bool test_incorrect_type(int a) { + return __builtin_hlsl_and(a, a); + // expected-error@-1{{invalid operand of type 'int' where 'bool' or a vector of such type is required}} +} From 5caefe261fb20a70497772c24bf4e9af0ff52aef Mon Sep 17 00:00:00 2001 From: Letu Ren Date: Thu, 20 Feb 2025 03:27:28 +0800 Subject: [PATCH 107/220] [MLIR][LLVMIR] Add support for asin acos atan intrinsics op (#127317) This is similar to https://github.com/llvm/llvm-project/pull/125748 --- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 4 ++++ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 19 ++++++++++++++++++ .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 20 +++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 72fae1bdbf461..c270b0898f865 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -170,6 +170,10 @@ def LLVM_SinOp : LLVM_UnaryIntrOpF<"sin">; def LLVM_CosOp : LLVM_UnaryIntrOpF<"cos">; def LLVM_TanOp : LLVM_UnaryIntrOpF<"tan">; +def LLVM_ASinOp : LLVM_UnaryIntrOpF<"asin">; +def LLVM_ACosOp : LLVM_UnaryIntrOpF<"acos">; +def LLVM_ATanOp : LLVM_UnaryIntrOpF<"atan">; + def LLVM_SinhOp : LLVM_UnaryIntrOpF<"sinh">; def LLVM_CoshOp : LLVM_UnaryIntrOpF<"cosh">; def LLVM_TanhOp : LLVM_UnaryIntrOpF<"tanh">; diff --git 
a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 249a0552c87f3..569b0def37856 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -120,6 +120,25 @@ define void @trig_test(float %0, <8 x float> %1) { ret void } +; CHECK-LABEL: llvm.func @inv_trig_test +define void @inv_trig_test(float %0, <8 x float> %1) { + ; CHECK: llvm.intr.asin(%{{.*}}) : (f32) -> f32 + %3 = call float @llvm.asin.f32(float %0) + ; CHECK: llvm.intr.asin(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %4 = call <8 x float> @llvm.asin.v8f32(<8 x float> %1) + + ; CHECK: llvm.intr.acos(%{{.*}}) : (f32) -> f32 + %5 = call float @llvm.acos.f32(float %0) + ; CHECK: llvm.intr.acos(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %6 = call <8 x float> @llvm.acos.v8f32(<8 x float> %1) + + ; CHECK: llvm.intr.atan(%{{.*}}) : (f32) -> f32 + %7 = call float @llvm.atan.f32(float %0) + ; CHECK: llvm.intr.atan(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %8 = call <8 x float> @llvm.atan.v8f32(<8 x float> %1) + + ret void +} ; CHECK-LABEL: llvm.func @hyperbolic_trig_test define void @hyperbolic_trig_test(float %0, <8 x float> %1) { ; CHECK: llvm.intr.sinh(%{{.*}}) : (f32) -> f32 diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 2c208789e36dd..3616a2e3c7b21 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -122,6 +122,26 @@ llvm.func @trig_test(%arg0: f32, %arg1: vector<8xf32>) { llvm.return } +// CHECK-LABEL: @inv_trig_test +llvm.func @inv_trig_test(%arg0: f32, %arg1: vector<8xf32>) { + // CHECK: call float @llvm.asin.f32 + llvm.intr.asin(%arg0) : (f32) -> f32 + // CHECK: call <8 x float> @llvm.asin.v8f32 + llvm.intr.asin(%arg1) : (vector<8xf32>) -> vector<8xf32> + + // CHECK: call float @llvm.acos.f32 + llvm.intr.acos(%arg0) : (f32) -> f32 + // CHECK: call <8 x float> @llvm.acos.v8f32 + llvm.intr.acos(%arg1) : (vector<8xf32>) -> vector<8xf32> + + // CHECK: call float @llvm.atan.f32 + llvm.intr.atan(%arg0) : (f32) -> f32 + // CHECK: call <8 x float> @llvm.atan.v8f32 + llvm.intr.atan(%arg1) : (vector<8xf32>) -> vector<8xf32> + + llvm.return +} + // CHECK-LABEL: @hyperbolic_trig_test llvm.func @hyperbolic_trig_test(%arg0: f32, %arg1: vector<8xf32>) { // CHECK: call float @llvm.sinh.f32 From c3ebbfd7368ec3e4737427eef602296a868a4ecd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 19 Feb 2025 11:30:55 -0800 Subject: [PATCH 108/220] [RISCV] Add a pass to remove ADDI by reassociating to fold into load/store address. (#127151) SelectionDAG will not reassociate adds to the end of a chain if there are multiple users of later additions. This prevents isel from folding the immediate into a load/store address. One easy way to see this is accessing an array in a struct with two different indices. An ADDI will be used to get to the start of the array then 2 different SHXADD instructions will be used to add the scaled indices. Finally the SHXADD will be used by different load instructions. We can remove the ADDI by folding the offset into each load. This patch adds a new pass that analyzes how an ADDI constant propagates through address arithmetic. If the arithmetic is only used by a load/store and the offset is small enough, we can adjust the load/store offset and remove the ADDI. This pass is placed before MachineCSE to allow cleanups if some instructions become common after removing offsets from their inputs. 
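As a rough illustration of the motivating pattern (hypothetical C source,
not taken from the benchmarks; the layout mirrors the new test below):

    struct S { char pad[400]; long arr[100]; };

    long sum2(struct S *s, long i, long j) {
      // The base of arr is s + 400 (one ADDI). Each element address is then
      // formed with a scaled add, so the pass can instead fold the 400 into
      // the immediate offset of both loads and delete the ADDI.
      return s->arr[i] + s->arr[j];
    }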
This pass gives ~3% improvement on dynamic instruction count on 541.leela_r and 544.nab_r from SPEC2017 for the train data set. There's a ~1% improvement on 557.xz_r. --- llvm/lib/Target/RISCV/CMakeLists.txt | 1 + llvm/lib/Target/RISCV/RISCV.h | 3 + llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp | 282 +++++++ llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 + llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/fold-mem-offset.ll | 733 +++++++++++++++++++ llvm/test/CodeGen/RISCV/split-offsets.ll | 23 +- llvm/test/CodeGen/RISCV/xtheadmemidx.ll | 5 +- 8 files changed, 1034 insertions(+), 16 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp create mode 100644 llvm/test/CodeGen/RISCV/fold-mem-offset.ll diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 9b23a5ab521c8..5d1ea50eba494 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -37,6 +37,7 @@ add_llvm_target(RISCVCodeGen RISCVMakeCompressible.cpp RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp + RISCVFoldMemOffset.cpp RISCVFrameLowering.cpp RISCVGatherScatterLowering.cpp RISCVIndirectBranchTracking.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 851eea1352852..641e2eb4094f9 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -52,6 +52,9 @@ void initializeRISCVVectorPeepholePass(PassRegistry &); FunctionPass *createRISCVOptWInstrsPass(); void initializeRISCVOptWInstrsPass(PassRegistry &); +FunctionPass *createRISCVFoldMemOffsetPass(); +void initializeRISCVFoldMemOffsetPass(PassRegistry &); + FunctionPass *createRISCVMergeBaseOffsetOptPass(); void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp new file mode 100644 index 0000000000000..989e9d859d64f --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp @@ -0,0 +1,282 @@ +//===- RISCVFoldMemOffset.cpp - Fold ADDI into memory offsets ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// Look for ADDIs that can be removed by folding their immediate into later +// load/store addresses. There may be other arithmetic instructions between the +// addi and load/store that we need to reassociate through. If the final result +// of the arithmetic is only used by load/store addresses, we can fold the +// offset into the all the load/store as long as it doesn't create an offset +// that is too large. 
+//
+//===---------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-fold-mem-offset"
+#define RISCV_FOLD_MEM_OFFSET_NAME "RISC-V Fold Memory Offset"
+
+namespace {
+
+class RISCVFoldMemOffset : public MachineFunctionPass {
+public:
+  static char ID;
+
+  RISCVFoldMemOffset() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  bool foldOffset(Register OrigReg, int64_t InitialOffset,
+                  const MachineRegisterInfo &MRI,
+                  DenseMap<MachineInstr *, int64_t> &FoldableInstrs);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return RISCV_FOLD_MEM_OFFSET_NAME; }
+};
+
+// Wrapper class around a std::optional to allow accumulation.
+class FoldableOffset {
+  std::optional<int64_t> Offset;
+
+public:
+  bool hasValue() const { return Offset.has_value(); }
+  int64_t getValue() const { return *Offset; }
+
+  FoldableOffset &operator=(int64_t RHS) {
+    Offset = RHS;
+    return *this;
+  }
+
+  FoldableOffset &operator+=(int64_t RHS) {
+    if (!Offset)
+      Offset = 0;
+    Offset = (uint64_t)*Offset + (uint64_t)RHS;
+    return *this;
+  }
+
+  int64_t operator*() { return *Offset; }
+};
+
+} // end anonymous namespace
+
+char RISCVFoldMemOffset::ID = 0;
+INITIALIZE_PASS(RISCVFoldMemOffset, DEBUG_TYPE, RISCV_FOLD_MEM_OFFSET_NAME,
+                false, false)
+
+FunctionPass *llvm::createRISCVFoldMemOffsetPass() {
+  return new RISCVFoldMemOffset();
+}
+
+// Walk forward from the ADDI looking for arithmetic instructions we can
+// analyze or memory instructions that use it as part of their address
+// calculation. For each arithmetic instruction we look up how the offset
+// contributes to the value in that register and use that information to
+// calculate the contribution to the output of this instruction.
+// Only addition and left shift are supported.
+// FIXME: Add multiplication by constant. The constant will be in a register.
+bool RISCVFoldMemOffset::foldOffset(
+    Register OrigReg, int64_t InitialOffset, const MachineRegisterInfo &MRI,
+    DenseMap<MachineInstr *, int64_t> &FoldableInstrs) {
+  // Map to hold how much the offset contributes to the value of this register.
+  DenseMap<Register, int64_t> RegToOffsetMap;
+
+  // Insert root offset into the map.
+  RegToOffsetMap[OrigReg] = InitialOffset;
+
+  std::queue<Register> Worklist;
+  Worklist.push(OrigReg);
+
+  while (!Worklist.empty()) {
+    Register Reg = Worklist.front();
+    Worklist.pop();
+
+    if (!Reg.isVirtual())
+      return false;
+
+    for (auto &User : MRI.use_nodbg_instructions(Reg)) {
+      FoldableOffset Offset;
+
+      switch (User.getOpcode()) {
+      default:
+        return false;
+      case RISCV::ADD:
+        if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg());
+            I != RegToOffsetMap.end())
+          Offset = I->second;
+        if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg());
+            I != RegToOffsetMap.end())
+          Offset += I->second;
+        break;
+      case RISCV::SH1ADD:
+        if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg());
+            I != RegToOffsetMap.end())
+          Offset = (uint64_t)I->second << 1;
+        if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg());
+            I != RegToOffsetMap.end())
+          Offset += I->second;
+        break;
+      case RISCV::SH2ADD:
+        if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg());
+            I != RegToOffsetMap.end())
+          Offset = (uint64_t)I->second << 2;
+        if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg());
+            I != RegToOffsetMap.end())
+          Offset += I->second;
+        break;
+      case RISCV::SH3ADD:
+        if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg());
+            I != RegToOffsetMap.end())
+          Offset = (uint64_t)I->second << 3;
+        if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg());
+            I != RegToOffsetMap.end())
+          Offset += I->second;
+        break;
+      case RISCV::ADD_UW:
+      case RISCV::SH1ADD_UW:
+      case RISCV::SH2ADD_UW:
+      case RISCV::SH3ADD_UW:
+        // Don't fold through the zero extended input.
+        if (User.getOperand(1).getReg() == Reg)
+          return false;
+        if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg());
+            I != RegToOffsetMap.end())
+          Offset = I->second;
+        break;
+      case RISCV::SLLI: {
+        unsigned ShAmt = User.getOperand(2).getImm();
+        if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg());
+            I != RegToOffsetMap.end())
+          Offset = (uint64_t)I->second << ShAmt;
+        break;
+      }
+      case RISCV::LB:
+      case RISCV::LBU:
+      case RISCV::SB:
+      case RISCV::LH:
+      case RISCV::LH_INX:
+      case RISCV::LHU:
+      case RISCV::FLH:
+      case RISCV::SH:
+      case RISCV::SH_INX:
+      case RISCV::FSH:
+      case RISCV::LW:
+      case RISCV::LW_INX:
+      case RISCV::LWU:
+      case RISCV::FLW:
+      case RISCV::SW:
+      case RISCV::SW_INX:
+      case RISCV::FSW:
+      case RISCV::LD:
+      case RISCV::FLD:
+      case RISCV::SD:
+      case RISCV::FSD: {
+        // Can't fold into store value.
+        if (User.getOperand(0).getReg() == Reg)
+          return false;
+
+        // Existing offset must be immediate.
+        if (!User.getOperand(2).isImm())
+          return false;
+
+        // Require at least one operation between the ADDI and the load/store.
+        // We have other optimizations that should handle the simple case.
+        if (User.getOperand(1).getReg() == OrigReg)
+          return false;
+
+        auto I = RegToOffsetMap.find(User.getOperand(1).getReg());
+        if (I == RegToOffsetMap.end())
+          return false;
+
+        int64_t LocalOffset = User.getOperand(2).getImm();
+        assert(isInt<12>(LocalOffset));
+        int64_t CombinedOffset = (uint64_t)LocalOffset + (uint64_t)I->second;
+        if (!isInt<12>(CombinedOffset))
+          return false;
+
+        FoldableInstrs[&User] = CombinedOffset;
+        continue;
+      }
+      }
+
+      // If we reach here we should have an accumulated offset.
+      assert(Offset.hasValue() && "Expected an offset");
+
+      // If the offset is new or changed, add the destination register to the
+      // work list.
+      int64_t OffsetVal = Offset.getValue();
+      auto P =
+          RegToOffsetMap.try_emplace(User.getOperand(0).getReg(), OffsetVal);
+      if (P.second) {
+        Worklist.push(User.getOperand(0).getReg());
+      } else if (P.first->second != OffsetVal) {
+        P.first->second = OffsetVal;
+        Worklist.push(User.getOperand(0).getReg());
+      }
+    }
+  }
+
+  return true;
+}
+
+bool RISCVFoldMemOffset::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  // This optimization may increase size by preventing compression.
+  if (MF.getFunction().hasOptSize())
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+      // FIXME: We can support ADDIW from an LUI+ADDIW pair if the result is
+      // equivalent to LUI+ADDI.
+      if (MI.getOpcode() != RISCV::ADDI)
+        continue;
+
+      // We only want to optimize register ADDIs.
+      if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm())
+        continue;
+
+      // Ignore 'li'.
+      if (MI.getOperand(1).getReg() == RISCV::X0)
+        continue;
+
+      int64_t Offset = MI.getOperand(2).getImm();
+      assert(isInt<12>(Offset));
+
+      DenseMap<MachineInstr *, int64_t> FoldableInstrs;
+
+      if (!foldOffset(MI.getOperand(0).getReg(), Offset, MRI, FoldableInstrs))
+        continue;
+
+      if (FoldableInstrs.empty())
+        continue;
+
+      // We can fold this ADDI.
+      // Rewrite all the instructions.
+      for (auto [MemMI, NewOffset] : FoldableInstrs)
+        MemMI->getOperand(2).setImm(NewOffset);
+
+      MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
+      MI.eraseFromParent();
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 167dbb53c5950..89e017807363b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -133,6 +133,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   initializeRISCVPostRAExpandPseudoPass(*PR);
   initializeRISCVMergeBaseOffsetOptPass(*PR);
   initializeRISCVOptWInstrsPass(*PR);
+  initializeRISCVFoldMemOffsetPass(*PR);
   initializeRISCVPreRAExpandPseudoPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
   initializeRISCVVectorPeepholePass(*PR);
@@ -590,6 +591,7 @@ void RISCVPassConfig::addMachineSSAOptimization() {
   addPass(createRISCVVectorPeepholePass());
   // TODO: Move this to pre regalloc
   addPass(createRISCVVMV0EliminationPass());
+  addPass(createRISCVFoldMemOffsetPass());
 
   TargetPassConfig::addMachineSSAOptimization();
 
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 2646dfeca4eb6..194223eee69eb 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -98,6 +98,7 @@
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT: RISC-V Vector Peephole Optimization
 ; CHECK-NEXT: RISC-V VMV0 Elimination
+; CHECK-NEXT: RISC-V Fold Memory Offset
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
diff --git a/llvm/test/CodeGen/RISCV/fold-mem-offset.ll b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll
new file mode 100644
index 0000000000000..b12fa509b0bea
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll
@@ -0,0 +1,733 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 | FileCheck %s
--check-prefixes=CHECK,RV32I +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 | FileCheck %s --check-prefixes=CHECK,RV64I +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV32ZBA +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV64ZBA + +define i64 @test_sh3add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh3add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a1, a1, a0 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a2, 480(a1) +; RV32I-NEXT: lw a1, 484(a1) +; RV32I-NEXT: lw a3, 400(a0) +; RV32I-NEXT: lw a0, 404(a0) +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a3, a2 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh3add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: add a1, a1, a0 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a1, 480(a1) +; RV64I-NEXT: ld a0, 400(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh3add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh3add a1, a1, a0 +; RV32ZBA-NEXT: sh3add a0, a2, a0 +; RV32ZBA-NEXT: lw a2, 480(a1) +; RV32ZBA-NEXT: lw a1, 484(a1) +; RV32ZBA-NEXT: lw a3, 400(a0) +; RV32ZBA-NEXT: lw a0, 404(a0) +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: add a0, a3, a2 +; RV32ZBA-NEXT: sltu a2, a0, a3 +; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh3add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add a1, a1, a0 +; RV64ZBA-NEXT: sh3add a0, a2, a0 +; RV64ZBA-NEXT: ld a1, 480(a1) +; RV64ZBA-NEXT: ld a0, 400(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %b = getelementptr inbounds nuw i8, ptr %p, i64 400 + %add = add iXLen %x, 10 + %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %add + %0 = load i64, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %y + %1 = load i64, ptr %arrayidx2, align 8 + %add3 = add nsw i64 %1, %0 + ret i64 %add3 +} + +define signext i32 @test_sh2add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh2add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, 1200(a1) +; RV32I-NEXT: lw a0, 1240(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh2add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 1200(a1) +; RV64I-NEXT: lw a0, 1240(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh2add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 1200(a1) +; RV32ZBA-NEXT: lw a0, 1240(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh2add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 1200(a1) +; RV64ZBA-NEXT: lw a0, 1240(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = 
getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +define signext i16 @test_sh1add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh1add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lh a1, 1600(a1) +; RV32I-NEXT: lh a0, 1620(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh1add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: slli a2, a2, 1 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lh a1, 1600(a1) +; RV64I-NEXT: lh a0, 1620(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh1add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh1add a1, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a2, a0 +; RV32ZBA-NEXT: lh a1, 1600(a1) +; RV32ZBA-NEXT: lh a0, 1620(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: slli a0, a0, 16 +; RV32ZBA-NEXT: srai a0, a0, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh1add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh1add a1, a1, a0 +; RV64ZBA-NEXT: sh1add a0, a2, a0 +; RV64ZBA-NEXT: lh a1, 1600(a1) +; RV64ZBA-NEXT: lh a0, 1620(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: slli a0, a0, 48 +; RV64ZBA-NEXT: srai a0, a0, 48 +; RV64ZBA-NEXT: ret +entry: + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %x + %0 = load i16, ptr %arrayidx, align 2 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %add + %1 = load i16, ptr %arrayidx2, align 2 + %add4 = add i16 %1, %0 + ret i16 %add4 +} + +define zeroext i8 @test_add(ptr %p, iXLen %x, iXLen %y) { +; CHECK-LABEL: test_add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: lbu a1, 1800(a1) +; CHECK-NEXT: lbu a0, 1810(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_add: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 1800(a1) +; ZBA-NEXT: lbu a0, 1810(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} + +define i64 @test_sh3add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh3add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a2, 400(a1) +; RV32I-NEXT: lw a1, 404(a1) +; RV32I-NEXT: lw a3, 400(a0) +; RV32I-NEXT: lw a0, 404(a0) +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a3, a2 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh3add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 29 +; RV64I-NEXT: srli 
a2, a2, 29 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a1, 400(a1) +; RV64I-NEXT: ld a0, 400(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh3add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh3add a1, a1, a0 +; RV32ZBA-NEXT: sh3add a0, a2, a0 +; RV32ZBA-NEXT: lw a2, 400(a1) +; RV32ZBA-NEXT: lw a1, 404(a1) +; RV32ZBA-NEXT: lw a3, 400(a0) +; RV32ZBA-NEXT: lw a0, 404(a0) +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: add a0, a3, a2 +; RV32ZBA-NEXT: sltu a2, a0, a3 +; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh3add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add.uw a1, a1, a0 +; RV64ZBA-NEXT: sh3add.uw a0, a2, a0 +; RV64ZBA-NEXT: ld a1, 400(a1) +; RV64ZBA-NEXT: ld a0, 400(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %b = getelementptr inbounds nuw i8, ptr %p, i64 400 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom + %0 = load i64, ptr %arrayidx, align 8 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom2 + %1 = load i64, ptr %arrayidx3, align 8 + %add4 = add nsw i64 %1, %0 + ret i64 %add4 +} + +define signext i32 @test_sh2add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh2add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a1, 1200(a1) +; RV32I-NEXT: lw a0, 1200(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh2add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 30 +; RV64I-NEXT: srli a2, a2, 30 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lw a1, 1200(a1) +; RV64I-NEXT: lw a0, 1200(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh2add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 1200(a1) +; RV32ZBA-NEXT: lw a0, 1200(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh2add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add.uw a1, a1, a0 +; RV64ZBA-NEXT: sh2add.uw a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 1200(a1) +; RV64ZBA-NEXT: lw a0, 1200(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom2 + %1 = load i32, ptr %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + ret i32 %add4 +} + +define signext i16 @test_sh1add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh1add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lh a1, 1600(a1) +; RV32I-NEXT: lh a0, 1620(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh1add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: addi a2, a2, 10 +; RV64I-NEXT: srli a1, a1, 31 +; RV64I-NEXT: 
slli a2, a2, 32 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: srli a2, a2, 31 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lh a1, 1600(a1) +; RV64I-NEXT: lh a0, 1600(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh1add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh1add a1, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a2, a0 +; RV32ZBA-NEXT: lh a1, 1600(a1) +; RV32ZBA-NEXT: lh a0, 1620(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: slli a0, a0, 16 +; RV32ZBA-NEXT: srai a0, a0, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh1add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh1add.uw a1, a1, a0 +; RV64ZBA-NEXT: addi a2, a2, 10 +; RV64ZBA-NEXT: sh1add.uw a0, a2, a0 +; RV64ZBA-NEXT: lh a1, 1600(a1) +; RV64ZBA-NEXT: lh a0, 1600(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: slli a0, a0, 48 +; RV64ZBA-NEXT: srai a0, a0, 48 +; RV64ZBA-NEXT: ret +entry: + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom + %0 = load i16, ptr %arrayidx, align 2 + %add = add i32 %y, 10 + %idxprom2 = zext i32 %add to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom2 + %1 = load i16, ptr %arrayidx3, align 2 + %add5 = add i16 %1, %0 + ret i16 %add5 +} + +define zeroext i8 @test_add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lbu a1, 1800(a1) +; RV32I-NEXT: lbu a0, 1800(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: srli a2, a2, 32 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lbu a1, 1800(a1) +; RV64I-NEXT: lbu a0, 1800(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: lbu a1, 1800(a1) +; RV32ZBA-NEXT: lbu a0, 1800(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: andi a0, a0, 255 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add.uw a1, a1, a0 +; RV64ZBA-NEXT: add.uw a0, a2, a0 +; RV64ZBA-NEXT: lbu a1, 1800(a1) +; RV64ZBA-NEXT: lbu a0, 1800(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: andi a0, a0, 255 +; RV64ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom + %0 = load i8, ptr %arrayidx, align 1 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom2 + %1 = load i8, ptr %arrayidx3, align 1 + %add5 = add i8 %1, %0 + ret i8 %add5 +} + +; The addi is part of the index and used with 2 different scales. 
+define signext i32 @test_scaled_index_addi(ptr %p, iXLen %x) { +; RV32I-LABEL: test_scaled_index_addi: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a2, a1, 2 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 1196(a2) +; RV32I-NEXT: lh a0, 1598(a0) +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_scaled_index_addi: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: add a2, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a1, 1196(a2) +; RV64I-NEXT: lh a0, 1598(a0) +; RV64I-NEXT: addw a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_scaled_index_addi: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a2, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a1, a0 +; RV32ZBA-NEXT: lw a1, 1196(a2) +; RV32ZBA-NEXT: lh a0, 1598(a0) +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_scaled_index_addi: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a2, a1, a0 +; RV64ZBA-NEXT: sh1add a0, a1, a0 +; RV64ZBA-NEXT: lw a1, 1196(a2) +; RV64ZBA-NEXT: lh a0, 1598(a0) +; RV64ZBA-NEXT: addw a0, a1, a0 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %sub = add iXLen %x, -1 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %sub + %0 = load i32, ptr %arrayidx, align 4 + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %sub + %1 = load i16, ptr %arrayidx2, align 2 + %conv = sext i16 %1 to i32 + %add = add nsw i32 %0, %conv + ret i32 %add +} + +; Offset is a pair of addis. We can fold one of them. +define signext i32 @test_medium_offset(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_medium_offset: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, 753(a1) +; RV32I-NEXT: lw a0, 793(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_medium_offset: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a0, a0, 2047 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 753(a1) +; RV64I-NEXT: lw a0, 793(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_medium_offset: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a0, a0, 2047 +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 753(a1) +; RV32ZBA-NEXT: lw a0, 793(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_medium_offset: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a0, a0, 2047 +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 753(a1) +; RV64ZBA-NEXT: lw a0, 793(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 + %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +; Offset is a lui+addiw. We can't fold this on RV64. 
+define signext i32 @test_large_offset(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_large_offset: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a3, 2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, -1392(a1) +; RV32I-NEXT: lw a0, -1352(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_large_offset: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: lui a3, 2 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: addiw a3, a3, -1392 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 0(a1) +; RV64I-NEXT: lw a0, 40(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_large_offset: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: li a3, 1700 +; RV32ZBA-NEXT: sh2add a0, a3, a0 +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 0(a1) +; RV32ZBA-NEXT: lw a0, 40(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_large_offset: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: li a3, 1700 +; RV64ZBA-NEXT: sh2add a0, a3, a0 +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 0(a1) +; RV64ZBA-NEXT: lw a0, 40(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %g = getelementptr inbounds nuw i8, ptr %p, i64 6800 + %arrayidx = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +; After folding we can CSE the sh2add +define signext i32 @test_cse(ptr %p, iXLen %x) { +; RV32I-LABEL: test_cse: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 1200(a0) +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: lw a0, 753(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_cse: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a1, 1200(a0) +; RV64I-NEXT: addi a0, a0, 2047 +; RV64I-NEXT: lw a0, 753(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_cse: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a0, a1, a0 +; RV32ZBA-NEXT: lw a1, 1200(a0) +; RV32ZBA-NEXT: addi a0, a0, 2047 +; RV32ZBA-NEXT: lw a0, 753(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_cse: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a0, a1, a0 +; RV64ZBA-NEXT: lw a1, 1200(a0) +; RV64ZBA-NEXT: addi a0, a0, 2047 +; RV64ZBA-NEXT: lw a0, 753(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 + %arrayidx1 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x + %1 = load i32, ptr %arrayidx1, align 4 + %add = add nsw i32 %1, %0 + ret i32 %add +} + +define zeroext i8 @test_optsize(ptr %p, iXLen %x, iXLen %y) optsize { +; CHECK-LABEL: test_optsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a0, a0, 1800 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; 
CHECK-NEXT: lbu a1, 0(a1) +; CHECK-NEXT: lbu a0, 10(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_optsize: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: addi a0, a0, 1800 +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 0(a1) +; ZBA-NEXT: lbu a0, 10(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} + +define zeroext i8 @test_minsize(ptr %p, iXLen %x, iXLen %y) minsize { +; CHECK-LABEL: test_minsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a0, a0, 1800 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: lbu a1, 0(a1) +; CHECK-NEXT: lbu a0, 10(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_minsize: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: addi a0, a0, 1800 +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 0(a1) +; ZBA-NEXT: lbu a0, 10(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index 8f5b044c3b3b8..b98aa954c09e7 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -14,14 +14,13 @@ define void @test1(ptr %sp, ptr %t, i32 %n) { ; RV32I-NEXT: lui a2, 20 ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: li a3, 2 -; RV32I-NEXT: addi a2, a2, -1920 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a2, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) +; RV32I-NEXT: sw a3, -1920(a0) +; RV32I-NEXT: sw a2, -1916(a0) +; RV32I-NEXT: sw a2, -1920(a1) +; RV32I-NEXT: sw a3, -1916(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test1: @@ -58,17 +57,16 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a4, 20 -; RV32I-NEXT: addi a4, a4, -1920 ; RV32I-NEXT: add a1, a1, a4 ; RV32I-NEXT: add a0, a0, a4 ; RV32I-NEXT: blez a2, .LBB1_2 ; RV32I-NEXT: .LBB1_1: # %while_body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: addi a4, a3, 1 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a4, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) +; RV32I-NEXT: sw a4, -1920(a0) +; RV32I-NEXT: sw a3, -1916(a0) +; RV32I-NEXT: sw a4, -1920(a1) +; RV32I-NEXT: sw a3, -1916(a1) ; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: blt a4, a2, .LBB1_1 ; RV32I-NEXT: .LBB1_2: # %while_end @@ -126,11 +124,10 @@ define void @test3(ptr %t) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, 20 ; RV32I-NEXT: li a2, 2 -; RV32I-NEXT: addi a1, a1, -1920 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li a1, 3 -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: 
sw a2, -1916(a0) +; RV32I-NEXT: sw a1, -1912(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test3: diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index e761fcb736a87..578f51a957a75 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -1136,10 +1136,9 @@ define i64 @lrd_large_offset(ptr %a, i64 %b) { ; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 3 ; RV32XTHEADMEMIDX-NEXT: add a0, a1, a0 ; RV32XTHEADMEMIDX-NEXT: lui a1, 23 -; RV32XTHEADMEMIDX-NEXT: addi a1, a1, 1792 ; RV32XTHEADMEMIDX-NEXT: add a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: lw a0, 0(a1) -; RV32XTHEADMEMIDX-NEXT: lw a1, 4(a1) +; RV32XTHEADMEMIDX-NEXT: lw a0, 1792(a1) +; RV32XTHEADMEMIDX-NEXT: lw a1, 1796(a1) ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lrd_large_offset: From b1f882f86a5ba87ac77ed0f31e06e77a34f8303b Mon Sep 17 00:00:00 2001 From: Jonathan Peyton Date: Wed, 19 Feb 2025 13:31:40 -0600 Subject: [PATCH 109/220] [OpenMP][NFC] Remove unused clock function types and globals (#127684) --- openmp/runtime/src/kmp.h | 12 ------------ openmp/runtime/src/kmp_global.cpp | 5 ----- 2 files changed, 17 deletions(-) diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 04bf6c3b34dac..7b565d12a9308 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -521,13 +521,6 @@ enum library_type { library_throughput }; -#if KMP_OS_LINUX -enum clock_function_type { - clock_function_gettimeofday, - clock_function_clock_gettime -}; -#endif /* KMP_OS_LINUX */ - #if KMP_MIC_SUPPORTED enum mic_type { non_mic, mic1, mic2, mic3, dummy }; #endif @@ -3545,11 +3538,6 @@ extern int __kmp_hot_teams_mode; extern int __kmp_hot_teams_max_level; #endif -#if KMP_OS_LINUX -extern enum clock_function_type __kmp_clock_function; -extern int __kmp_clock_function_param; -#endif /* KMP_OS_LINUX */ - #if KMP_MIC_SUPPORTED extern enum mic_type __kmp_mic_type; #endif diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index 5017cd3de4be5..ce803bb0d6079 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -242,11 +242,6 @@ enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + // of public intel extension schedules }; -#if KMP_OS_LINUX -enum clock_function_type __kmp_clock_function; -int __kmp_clock_function_param; -#endif /* KMP_OS_LINUX */ - #if KMP_MIC_SUPPORTED enum mic_type __kmp_mic_type = non_mic; #endif From 851177c2e35e17d5bca68521a473f0dad1ad29ec Mon Sep 17 00:00:00 2001 From: Jonathan Peyton Date: Wed, 19 Feb 2025 13:32:00 -0600 Subject: [PATCH 110/220] [OpenMP][NFC] Remove unused __kmp_dispatch_lock global (#127686) --- openmp/runtime/src/kmp.h | 1 - openmp/runtime/src/kmp_global.cpp | 4 ---- openmp/runtime/src/kmp_runtime.cpp | 1 - 3 files changed, 6 deletions(-) diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 7b565d12a9308..b002b29e13747 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -3408,7 +3408,6 @@ extern kmp_bootstrap_lock_t __kmp_threads expansion to co-exist */ extern kmp_lock_t __kmp_global_lock; /* control OS/global access */ -extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access */ extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */ extern enum library_type __kmp_library; diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index ce803bb0d6079..c6446bdb90f63 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ 
b/openmp/runtime/src/kmp_global.cpp
@@ -488,8 +488,6 @@ KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
 KMP_ALIGN_CACHE_INTERNODE
 KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */
-KMP_ALIGN_CACHE_INTERNODE
 KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
 #else
 KMP_ALIGN_CACHE
@@ -508,8 +506,6 @@ KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
 KMP_ALIGN(128)
 KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
 KMP_ALIGN(128)
-kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */
-KMP_ALIGN(128)
 KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
 #endif
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 2c8d9304c46bc..c42c89e106690 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -7143,7 +7143,6 @@ static void __kmp_do_serial_initialize(void) {
   __kmp_stats_init();
 #endif
   __kmp_init_lock(&__kmp_global_lock);
-  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
   __kmp_init_lock(&__kmp_debug_lock);
   __kmp_init_atomic_lock(&__kmp_atomic_lock);
   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);

From dca73063653ca7d35afb3226ae66623495086204 Mon Sep 17 00:00:00 2001
From: Petr Hosek
Date: Wed, 19 Feb 2025 11:36:09 -0800
Subject: [PATCH 111/220] [clang][perf-training] Support excluding LLVM build from PGO training (#126876)

Using the LLVM build itself for PGO training is convenient and a great
starting point, but it also has several issues:

* LLVM build implicitly depends on tools other than CMake and C/C++
  compiler, and if those tools aren't available in PATH, the build will
  fail.

* LLVM build also requires standard headers and libraries which may not
  always be available in the default location, requiring an explicit
  sysroot.

* Building a single configuration (-DCMAKE_BUILD_TYPE=Release) only
  exercises the -O3 pipeline and can pessimize other configurations.

* Building for the host target doesn't exercise all other targets.

* Since LLVMSupport is a static library, this doesn't exercise the
  linker (beyond what the CMake itself does).

Rather than using the LLVM build, ideally we would provide a more minimal,
purpose-built corpus. While we're working on building such a corpus,
provide a CMake option that lets vendors disable the use of the LLVM
build for PGO training.
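For example, a vendor could opt out of the LLVM-build training step with
an invocation along these lines (illustrative only; of these options, just
CLANG_PGO_TRAINING_USE_LLVM_BUILD is introduced by this patch, the others
already exist):

    cmake -G Ninja -DLLVM_BUILD_INSTRUMENTED=IR \
        -DCLANG_PGO_TRAINING_USE_LLVM_BUILD=OFF \
        -DCLANG_PGO_TRAINING_DATA_SOURCE_DIR=/path/to/training/corpus \
        ../llvm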
--- clang/utils/perf-training/CMakeLists.txt | 6 ++++++ clang/utils/perf-training/lit.cfg | 3 +++ clang/utils/perf-training/lit.site.cfg.in | 1 + 3 files changed, 10 insertions(+) diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 4aed086563ee9..0c1cdd9a1fb60 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -6,6 +6,12 @@ set(CLANG_PGO_TRAINING_DATA "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH set(CLANG_PGO_TRAINING_DATA_SOURCE_DIR OFF CACHE STRING "Path to source directory containing cmake project with source files to use for generating pgo data") set(CLANG_PGO_TRAINING_DEPS "" CACHE STRING "Extra dependencies needed to build the PGO training data.") +option(CLANG_PGO_TRAINING_USE_LLVM_BUILD "Use LLVM build for generating PGO data" ON) + +llvm_canonicalize_cmake_booleans( + CLANG_PGO_TRAINING_USE_LLVM +) + if(LLVM_BUILD_INSTRUMENTED) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in diff --git a/clang/utils/perf-training/lit.cfg b/clang/utils/perf-training/lit.cfg index adefc7893ac44..3f6089b7139a7 100644 --- a/clang/utils/perf-training/lit.cfg +++ b/clang/utils/perf-training/lit.cfg @@ -27,6 +27,9 @@ config.clang = lit.util.which('clang', config.clang_tools_dir).replace('\\', '/' config.name = 'Clang Perf Training' config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test'] +if not config.use_llvm_build: + config.excludes = ['llvm-support'] + cc1_wrapper = '%s %s/perf-helper.py cc1' % (config.python_exe, config.perf_helper_dir) use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") diff --git a/clang/utils/perf-training/lit.site.cfg.in b/clang/utils/perf-training/lit.site.cfg.in index 9d279d552919a..da81ec21a28a6 100644 --- a/clang/utils/perf-training/lit.site.cfg.in +++ b/clang/utils/perf-training/lit.site.cfg.in @@ -11,6 +11,7 @@ config.python_exe = "@Python3_EXECUTABLE@" config.cmake_exe = "@CMAKE_COMMAND@" config.llvm_src_dir ="@CMAKE_SOURCE_DIR@" config.cmake_generator ="@CMAKE_GENERATOR@" +config.use_llvm_build = @CLANG_PGO_TRAINING_USE_LLVM_BUILD@ # Let the main config do the real work. lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/lit.cfg") From 4a411eb4ee673e2687d38fda16d6db6b907f37d2 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 19 Feb 2025 14:52:02 -0500 Subject: [PATCH 112/220] [MLIR] Fix rewrite of ops with vector operands to LLVM on GPU (#127844) There was a discrepancy between the type-converter and rewrite-pattern parts of conversion to LLVM used in various GPU targets, at least ROCDL and NVVM: - The TypeConverter part was handling vectors of arbitrary rank, converting them to nests of `!llvm.array< ... >` with a vector at the inner-most dimension: https://github.com/llvm/llvm-project/blob/8337d01e3058e7f47675f5b2b908b4e7821895d7/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp#L629-L655 - The rewrite pattern part was not handling `llvm.array`: https://github.com/llvm/llvm-project/blob/8337d01e3058e7f47675f5b2b908b4e7821895d7/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp#L594-L596 That led to conversion failures when lowering `math` dialect ops on rank-2 vectors, as in the testcase being added in this PR. 
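A minimal input that exhibited the failure (it mirrors the rank-2 case
added to math-to-rocdl.mlir in this patch):

    func.func @math_sin_vector_2d(%arg : vector<2x2xf16>) -> vector<2x2xf16> {
      // The type converter lowers vector<2x2xf16> to !llvm.array<2 x vector<2xf16>>,
      // which the scalarization pattern previously failed to legalize.
      %result = math.sin %arg : vector<2x2xf16>
      func.return %result : vector<2x2xf16>
    }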
This PR fixes this by reusing a shared utility already used in other
conversions to LLVM:
https://github.com/llvm/llvm-project/blob/8337d01e3058e7f47675f5b2b908b4e7821895d7/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp#L80-L104

---------

Signed-off-by: Benoit Jacob
---
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   | 51 ++++++++++++-------
 .../lib/Conversion/GPUCommon/GPUOpsLowering.h |  5 +-
 .../Conversion/MathToROCDL/math-to-rocdl.mlir | 51 +++++++++++++++++++
 3 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index cfa434699cdef..c3b3a78abe7f7 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -9,6 +9,7 @@
 #include "GPUOpsLowering.h"
 
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -586,22 +587,15 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite(
   return success();
 }
 
-/// Unrolls op if it's operating on vectors.
-LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands,
-                                      ConversionPatternRewriter &rewriter,
-                                      const LLVMTypeConverter &converter) {
+/// Helper for impl::scalarizeVectorOp. Scalarizes vectors to elements.
+/// Used either directly (for ops on 1D vectors) or as the callback passed to
+/// detail::handleMultidimensionalVectors (for ops on higher-rank vectors).
+static Value scalarizeVectorOpHelper(Operation *op, ValueRange operands,
+                                     Type llvm1DVectorTy,
+                                     ConversionPatternRewriter &rewriter,
+                                     const LLVMTypeConverter &converter) {
   TypeRange operandTypes(operands);
-  if (llvm::none_of(operandTypes, llvm::IsaPred<VectorType>)) {
-    return rewriter.notifyMatchFailure(op, "expected vector operand");
-  }
-  if (op->getNumRegions() != 0 || op->getNumSuccessors() != 0)
-    return rewriter.notifyMatchFailure(op, "expected no region/successor");
-  if (op->getNumResults() != 1)
-    return rewriter.notifyMatchFailure(op, "expected single result");
-  VectorType vectorType = dyn_cast<VectorType>(op->getResult(0).getType());
-  if (!vectorType)
-    return rewriter.notifyMatchFailure(op, "expected vector result");
-
+  VectorType vectorType = cast<VectorType>(llvm1DVectorTy);
   Location loc = op->getLoc();
   Value result = rewriter.create(loc, vectorType);
   Type indexType = converter.convertType(rewriter.getIndexType());
@@ -621,9 +615,32 @@ LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands,
     result = rewriter.create(
         loc, result, scalarOp->getResult(0), index);
   }
+  return result;
+}
 
-  rewriter.replaceOp(op, result);
-  return success();
+/// Unrolls op to array/vector elements.
+LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands,
+                                      ConversionPatternRewriter &rewriter,
+                                      const LLVMTypeConverter &converter) {
+  TypeRange operandTypes(operands);
+  if (llvm::any_of(operandTypes, llvm::IsaPred<VectorType>)) {
+    VectorType vectorType = cast<VectorType>(op->getResultTypes()[0]);
+    rewriter.replaceOp(op, scalarizeVectorOpHelper(op, operands, vectorType,
+                                                   rewriter, converter));
+    return success();
+  }
+
+  if (llvm::any_of(operandTypes, llvm::IsaPred<LLVM::LLVMArrayType>)) {
+    return LLVM::detail::handleMultidimensionalVectors(
+        op, operands, converter,
+        [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
+          return scalarizeVectorOpHelper(op, operands, llvm1DVectorTy, rewriter,
+                                         converter);
+        },
+        rewriter);
+  }
+
+  return rewriter.notifyMatchFailure(op, "no llvm.array or vector to unroll");
+}
 
 static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index e73a74845d2b6..bd2fd020f684b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -172,13 +172,13 @@ struct GPUReturnOpLowering : public ConvertOpToLLVMPattern<gpu::ReturnOp> {
 };
 
 namespace impl {
-/// Unrolls op if it's operating on vectors.
+/// Unrolls op to array/vector elements.
 LogicalResult scalarizeVectorOp(Operation *op, ValueRange operands,
                                 ConversionPatternRewriter &rewriter,
                                 const LLVMTypeConverter &converter);
 } // namespace impl
 
-/// Rewriting that unrolls SourceOp to scalars if it's operating on vectors.
+/// Unrolls SourceOp to array/vector elements.
 template <typename SourceOp>
 struct ScalarizeVectorOpLowering : public ConvertOpToLLVMPattern<SourceOp> {
 public:
@@ -191,6 +191,7 @@ struct ScalarizeVectorOpLowering : public ConvertOpToLLVMPattern<SourceOp> {
     *this->getTypeConverter());
   }
 };
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
diff --git a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir
index e4b2f01d6544a..9448304f11dbd 100644
--- a/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir
+++ b/mlir/test/Conversion/MathToROCDL/math-to-rocdl.mlir
@@ -513,3 +513,54 @@ module {
     "test.possible_terminator"() : () -> ()
   }) : () -> ()
 }
+
+// -----
+
+module @test_module {
+  // CHECK: llvm.func @__ocml_sin_f16(f16) -> f16
+  // CHECK-LABEL: func @math_sin_vector_1d
+  func.func @math_sin_vector_1d(%arg : vector<4xf16>) -> vector<4xf16> {
+    // CHECK: llvm.extractelement {{.*}} : vector<4xf16>
+    // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16
+    // CHECK: llvm.insertelement {{.*}} : vector<4xf16>
+    // CHECK: llvm.extractelement {{.*}} : vector<4xf16>
+    // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16
+    // CHECK: llvm.insertelement {{.*}} : vector<4xf16>
+    // CHECK: llvm.extractelement {{.*}} : vector<4xf16>
+    // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16
+    // CHECK: llvm.insertelement {{.*}} : vector<4xf16>
+    // CHECK: llvm.extractelement {{.*}} : vector<4xf16>
+    // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16
+    // CHECK: llvm.insertelement {{.*}} : vector<4xf16>
+    %result = math.sin %arg : vector<4xf16>
+    func.return %result : vector<4xf16>
+  }
+}
+
+// -----
+
+module @test_module {
+  // CHECK: llvm.func @__ocml_sin_f16(f16) -> f16
+  // CHECK-LABEL: func @math_sin_vector_2d
+  func.func @math_sin_vector_2d(%arg : vector<2x2xf16>) -> vector<2x2xf16> {
+    // CHECK: builtin.unrealized_conversion_cast {{.*}} : vector<2x2xf16> to !llvm.array<2 x vector<2xf16>>
+
// CHECK: llvm.extractvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + // CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // CHECK: llvm.insertvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + // CHECK: llvm.extractvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + // CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // CHECK: llvm.extractelement {{.*}} : vector<2xf16> + // CHECK: llvm.call @__ocml_sin_f16(%{{.*}}) : (f16) -> f16 + // CHECK: llvm.insertelement {{.*}} : vector<2xf16> + // CHECK: llvm.insertvalue {{.*}} : !llvm.array<2 x vector<2xf16>> + %result = math.sin %arg : vector<2x2xf16> + func.return %result : vector<2x2xf16> + } +} From 1987f93d03cec41f2599752cb63c9d130b901de3 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 19 Feb 2025 11:54:44 -0800 Subject: [PATCH 113/220] [SandboxIR] OpaqueValue (#127699) This patch implements a new subclass of the Value class used for Sandbox IR Values that we don't support, like metadata or inline asm. The goal is to never have null sandboxir::Value objects, because this is not the expected behavior. --- llvm/include/llvm/SandboxIR/Value.h | 23 ++++++++++++++++++++++ llvm/include/llvm/SandboxIR/Values.def | 1 + llvm/lib/SandboxIR/BasicBlock.cpp | 6 ------ llvm/lib/SandboxIR/Context.cpp | 10 ++++++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 23 ++++++++++++++++++++++ 5 files changed, 57 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index 28e33ca0f2312..2e91b96bb22e6 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -9,6 +9,7 @@ #ifndef LLVM_SANDBOXIR_VALUE_H #define LLVM_SANDBOXIR_VALUE_H +#include "llvm/IR/Metadata.h" #include "llvm/IR/Value.h" #include "llvm/SandboxIR/Use.h" @@ -282,6 +283,28 @@ class Value { #endif }; +class OpaqueValue : public Value { +protected: + OpaqueValue(llvm::Value *V, Context &Ctx) + : Value(ClassID::OpaqueValue, V, Ctx) {} + friend class Context; // For constructor. 
+
+public:
+  static bool classof(const Value *From) {
+    return From->getSubclassID() == ClassID::OpaqueValue;
+  }
+#ifndef NDEBUG
+  void verify() const override {
+    assert((isa<llvm::MetadataAsValue>(Val) || isa<llvm::InlineAsm>(Val)) &&
+           "Expected Metadata or InlineAssembly!");
+  }
+  void dumpOS(raw_ostream &OS) const override {
+    dumpCommonPrefix(OS);
+    dumpCommonSuffix(OS);
+  }
+#endif // NDEBUG
+};
+
 } // namespace llvm::sandboxir
 
 #endif // LLVM_SANDBOXIR_VALUE_H
diff --git a/llvm/include/llvm/SandboxIR/Values.def b/llvm/include/llvm/SandboxIR/Values.def
index 3d8ad6ce197f4..f5ead54a08e10 100644
--- a/llvm/include/llvm/SandboxIR/Values.def
+++ b/llvm/include/llvm/SandboxIR/Values.def
@@ -21,6 +21,7 @@ DEF_CONST(Function, Function)
 
 DEF_VALUE(Argument, Argument)
 
+DEF_VALUE(OpaqueValue, OpaqueValue)
 DEF_USER(User, User)
 DEF_VALUE(Block, BasicBlock)
diff --git a/llvm/lib/SandboxIR/BasicBlock.cpp b/llvm/lib/SandboxIR/BasicBlock.cpp
index 983a5e8b8825e..b45c046402487 100644
--- a/llvm/lib/SandboxIR/BasicBlock.cpp
+++ b/llvm/lib/SandboxIR/BasicBlock.cpp
@@ -67,12 +67,6 @@ void BasicBlock::buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB) {
       // Skip instruction's label operands
       if (isa<llvm::BasicBlock>(Op))
         continue;
-      // Skip metadata
-      if (isa<llvm::MetadataAsValue>(Op))
-        continue;
-      // Skip asm
-      if (isa<llvm::InlineAsm>(Op))
-        continue;
       Ctx.getOrCreateValue(Op);
     }
   }
diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp
index 6a397b02d6bde..38ca60cfac3ec 100644
--- a/llvm/lib/SandboxIR/Context.cpp
+++ b/llvm/lib/SandboxIR/Context.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/SandboxIR/Context.h"
+#include "llvm/IR/InlineAsm.h"
 #include "llvm/SandboxIR/Function.h"
 #include "llvm/SandboxIR/Instruction.h"
 #include "llvm/SandboxIR/Module.h"
@@ -169,6 +170,15 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
       return SBBB;
     return nullptr;
   }
+  // TODO: Move these checks after more common Values, like after Instruction.
+  if (auto *MD = dyn_cast<llvm::MetadataAsValue>(LLVMV)) {
+    It->second = std::unique_ptr<OpaqueValue>(new OpaqueValue(MD, *this));
+    return It->second.get();
+  }
+  if (auto *Asm = dyn_cast<llvm::InlineAsm>(LLVMV)) {
+    It->second = std::unique_ptr<OpaqueValue>(new OpaqueValue(Asm, *this));
+    return It->second.get();
+  }
 
   assert(isa<llvm::Instruction>(LLVMV) && "Expected Instruction");
   switch (cast<llvm::Instruction>(LLVMV)->getOpcode()) {
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index 2ad33659c609b..088264e0429fd 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -6166,3 +6166,26 @@ define void @bar() {
   // This should not crash, even though there is already a value for LLVMBar.
   Ctx.createFunction(&LLVMBar);
 }
+
+TEST_F(SandboxIRTest, OpaqueValue) {
+  parseIR(C, R"IR(
+declare void @bar(metadata)
+define void @foo() {
+  call void @bar(metadata !1)
+  call void asm "asm", ""()
+  ret void
+}
+!1 = !{}
+)IR");
+  Function &LLVMFoo = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto *F = Ctx.createFunction(&LLVMFoo);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *Call = cast<sandboxir::CallInst>(&*It++);
+  auto *Op0 = Call->getOperand(0);
+  EXPECT_TRUE(isa<sandboxir::OpaqueValue>(Op0));
+  auto *Asm = cast<sandboxir::CallInst>(&*It++);
+  auto *AsmOp0 = Asm->getOperand(0);
+  EXPECT_TRUE(isa<sandboxir::OpaqueValue>(AsmOp0));
+}
From 37d0f20593a65c552d717561efb64c8cf29c1d3c Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 19 Feb 2025 11:57:53 -0800
Subject: [PATCH 114/220] Revert "[RISCV] Add a pass to remove ADDI by
 reassociating to fold into load/store address.
(#127151)" This reverts commit c3ebbfd7368ec3e4737427eef602296a868a4ecd. Seeing some test failures on the build bot. --- llvm/lib/Target/RISCV/CMakeLists.txt | 1 - llvm/lib/Target/RISCV/RISCV.h | 3 - llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp | 282 ------- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 - llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 - llvm/test/CodeGen/RISCV/fold-mem-offset.ll | 733 ------------------- llvm/test/CodeGen/RISCV/split-offsets.ll | 23 +- llvm/test/CodeGen/RISCV/xtheadmemidx.ll | 5 +- 8 files changed, 16 insertions(+), 1034 deletions(-) delete mode 100644 llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp delete mode 100644 llvm/test/CodeGen/RISCV/fold-mem-offset.ll diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 5d1ea50eba494..9b23a5ab521c8 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -37,7 +37,6 @@ add_llvm_target(RISCVCodeGen RISCVMakeCompressible.cpp RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp - RISCVFoldMemOffset.cpp RISCVFrameLowering.cpp RISCVGatherScatterLowering.cpp RISCVIndirectBranchTracking.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 641e2eb4094f9..851eea1352852 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -52,9 +52,6 @@ void initializeRISCVVectorPeepholePass(PassRegistry &); FunctionPass *createRISCVOptWInstrsPass(); void initializeRISCVOptWInstrsPass(PassRegistry &); -FunctionPass *createRISCVFoldMemOffsetPass(); -void initializeRISCVFoldMemOffsetPass(PassRegistry &); - FunctionPass *createRISCVMergeBaseOffsetOptPass(); void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp deleted file mode 100644 index 989e9d859d64f..0000000000000 --- a/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp +++ /dev/null @@ -1,282 +0,0 @@ -//===- RISCVFoldMemOffset.cpp - Fold ADDI into memory offsets ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===---------------------------------------------------------------------===// -// -// Look for ADDIs that can be removed by folding their immediate into later -// load/store addresses. There may be other arithmetic instructions between the -// addi and load/store that we need to reassociate through. If the final result -// of the arithmetic is only used by load/store addresses, we can fold the -// offset into the all the load/store as long as it doesn't create an offset -// that is too large. 
-// -//===---------------------------------------------------------------------===// - -#include "RISCV.h" -#include "RISCVSubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "riscv-fold-mem-offset" -#define RISCV_FOLD_MEM_OFFSET_NAME "RISC-V Fold Memory Offset" - -namespace { - -class RISCVFoldMemOffset : public MachineFunctionPass { -public: - static char ID; - - RISCVFoldMemOffset() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - bool foldOffset(Register OrigReg, int64_t InitialOffset, - const MachineRegisterInfo &MRI, - DenseMap &FoldableInstrs); - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - StringRef getPassName() const override { return RISCV_FOLD_MEM_OFFSET_NAME; } -}; - -// Wrapper class around a std::optional to allow accumulation. -class FoldableOffset { - std::optional Offset; - -public: - bool hasValue() const { return Offset.has_value(); } - int64_t getValue() const { return *Offset; } - - FoldableOffset &operator=(int64_t RHS) { - Offset = RHS; - return *this; - } - - FoldableOffset &operator+=(int64_t RHS) { - if (!Offset) - Offset = 0; - Offset = (uint64_t)*Offset + (uint64_t)RHS; - return *this; - } - - int64_t operator*() { return *Offset; } -}; - -} // end anonymous namespace - -char RISCVFoldMemOffset::ID = 0; -INITIALIZE_PASS(RISCVFoldMemOffset, DEBUG_TYPE, RISCV_FOLD_MEM_OFFSET_NAME, - false, false) - -FunctionPass *llvm::createRISCVFoldMemOffsetPass() { - return new RISCVFoldMemOffset(); -} - -// Walk forward from the ADDI looking for arithmetic instructions we can -// analyze or memory instructions that use it as part of their address -// calculation. For each arithmetic instruction we lookup how the offset -// contributes to the value in that register use that information to -// calculate the contribution to the output of this instruction. -// Only addition and left shift are supported. -// FIXME: Add multiplication by constant. The constant will be in a register. -bool RISCVFoldMemOffset::foldOffset( - Register OrigReg, int64_t InitialOffset, const MachineRegisterInfo &MRI, - DenseMap &FoldableInstrs) { - // Map to hold how much the offset contributes to the value of this register. - DenseMap RegToOffsetMap; - - // Insert root offset into the map. 
- RegToOffsetMap[OrigReg] = InitialOffset; - - std::queue Worklist; - Worklist.push(OrigReg); - - while (!Worklist.empty()) { - Register Reg = Worklist.front(); - Worklist.pop(); - - if (!Reg.isVirtual()) - return false; - - for (auto &User : MRI.use_nodbg_instructions(Reg)) { - FoldableOffset Offset; - - switch (User.getOpcode()) { - default: - return false; - case RISCV::ADD: - if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); - I != RegToOffsetMap.end()) - Offset = I->second; - if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); - I != RegToOffsetMap.end()) - Offset += I->second; - break; - case RISCV::SH1ADD: - if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); - I != RegToOffsetMap.end()) - Offset = (uint64_t)I->second << 1; - if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); - I != RegToOffsetMap.end()) - Offset += I->second; - break; - case RISCV::SH2ADD: - if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); - I != RegToOffsetMap.end()) - Offset = (uint64_t)I->second << 2; - if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); - I != RegToOffsetMap.end()) - Offset += I->second; - break; - case RISCV::SH3ADD: - if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); - I != RegToOffsetMap.end()) - Offset = (uint64_t)I->second << 3; - if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); - I != RegToOffsetMap.end()) - Offset += I->second; - break; - case RISCV::ADD_UW: - case RISCV::SH1ADD_UW: - case RISCV::SH2ADD_UW: - case RISCV::SH3ADD_UW: - // Don't fold through the zero extended input. - if (User.getOperand(1).getReg() == Reg) - return false; - if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); - I != RegToOffsetMap.end()) - Offset = I->second; - break; - case RISCV::SLLI: { - unsigned ShAmt = User.getOperand(2).getImm(); - if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); - I != RegToOffsetMap.end()) - Offset = (uint64_t)I->second << ShAmt; - break; - } - case RISCV::LB: - case RISCV::LBU: - case RISCV::SB: - case RISCV::LH: - case RISCV::LH_INX: - case RISCV::LHU: - case RISCV::FLH: - case RISCV::SH: - case RISCV::SH_INX: - case RISCV::FSH: - case RISCV::LW: - case RISCV::LW_INX: - case RISCV::LWU: - case RISCV::FLW: - case RISCV::SW: - case RISCV::SW_INX: - case RISCV::FSW: - case RISCV::LD: - case RISCV::FLD: - case RISCV::SD: - case RISCV::FSD: { - // Can't fold into store value. - if (User.getOperand(0).getReg() == Reg) - return false; - - // Existing offset must be immediate. - if (!User.getOperand(2).isImm()) - return false; - - // Require at least one operation between the ADDI and the load/store. - // We have other optimizations that should handle the simple case. - if (User.getOperand(1).getReg() == OrigReg) - return false; - - auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); - if (I == RegToOffsetMap.end()) - return false; - - int64_t LocalOffset = User.getOperand(2).getImm(); - assert(isInt<12>(LocalOffset)); - int64_t CombinedOffset = (uint64_t)LocalOffset + (uint64_t)I->second; - if (!isInt<12>(CombinedOffset)) - return false; - - FoldableInstrs[&User] = CombinedOffset; - continue; - } - } - - // If we reach here we should have an accumulated offset. - assert(Offset.hasValue() && "Expected an offset"); - - // If the offset is new or changed, add the destination register to the - // work list. 
- int64_t OffsetVal = Offset.getValue(); - auto P = - RegToOffsetMap.try_emplace(User.getOperand(0).getReg(), OffsetVal); - if (P.second) { - Worklist.push(User.getOperand(0).getReg()); - } else if (P.first->second != OffsetVal) { - P.first->second = OffsetVal; - Worklist.push(User.getOperand(0).getReg()); - } - } - } - - return true; -} - -bool RISCVFoldMemOffset::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - // This optimization may increase size by preventing compression. - if (MF.getFunction().hasOptSize()) - return false; - - MachineRegisterInfo &MRI = MF.getRegInfo(); - - bool MadeChange = false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { - // FIXME: We can support ADDIW from an LUI+ADDIW pair if the result is - // equivalent to LUI+ADDI. - if (MI.getOpcode() != RISCV::ADDI) - continue; - - // We only want to optimize register ADDIs. - if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) - continue; - - // Ignore 'li'. - if (MI.getOperand(1).getReg() == RISCV::X0) - continue; - - int64_t Offset = MI.getOperand(2).getImm(); - assert(isInt<12>(Offset)); - - DenseMap FoldableInstrs; - - if (!foldOffset(MI.getOperand(0).getReg(), Offset, MRI, FoldableInstrs)) - continue; - - if (FoldableInstrs.empty()) - continue; - - // We can fold this ADDI. - // Rewrite all the instructions. - for (auto [MemMI, NewOffset] : FoldableInstrs) - MemMI->getOperand(2).setImm(NewOffset); - - MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); - MI.eraseFromParent(); - } - } - - return MadeChange; -} diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 89e017807363b..167dbb53c5950 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -133,7 +133,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVPostRAExpandPseudoPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVOptWInstrsPass(*PR); - initializeRISCVFoldMemOffsetPass(*PR); initializeRISCVPreRAExpandPseudoPass(*PR); initializeRISCVExpandPseudoPass(*PR); initializeRISCVVectorPeepholePass(*PR); @@ -591,7 +590,6 @@ void RISCVPassConfig::addMachineSSAOptimization() { addPass(createRISCVVectorPeepholePass()); // TODO: Move this to pre regalloc addPass(createRISCVVMV0EliminationPass()); - addPass(createRISCVFoldMemOffsetPass()); TargetPassConfig::addMachineSSAOptimization(); diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 194223eee69eb..2646dfeca4eb6 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -98,7 +98,6 @@ ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: RISC-V Vector Peephole Optimization ; CHECK-NEXT: RISC-V VMV0 Elimination -; CHECK-NEXT: RISC-V Fold Memory Offset ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs diff --git a/llvm/test/CodeGen/RISCV/fold-mem-offset.ll b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll deleted file mode 100644 index b12fa509b0bea..0000000000000 --- a/llvm/test/CodeGen/RISCV/fold-mem-offset.ll +++ /dev/null @@ -1,733 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 | FileCheck %s 
--check-prefixes=CHECK,RV32I -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 | FileCheck %s --check-prefixes=CHECK,RV64I -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV32ZBA -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV64ZBA - -define i64 @test_sh3add(ptr %p, iXLen %x, iXLen %y) { -; RV32I-LABEL: test_sh3add: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: slli a2, a2, 3 -; RV32I-NEXT: add a1, a1, a0 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: lw a2, 480(a1) -; RV32I-NEXT: lw a1, 484(a1) -; RV32I-NEXT: lw a3, 400(a0) -; RV32I-NEXT: lw a0, 404(a0) -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a3, a2 -; RV32I-NEXT: sltu a2, a0, a3 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_sh3add: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: slli a2, a2, 3 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: ld a1, 480(a1) -; RV64I-NEXT: ld a0, 400(a0) -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_sh3add: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh3add a1, a1, a0 -; RV32ZBA-NEXT: sh3add a0, a2, a0 -; RV32ZBA-NEXT: lw a2, 480(a1) -; RV32ZBA-NEXT: lw a1, 484(a1) -; RV32ZBA-NEXT: lw a3, 400(a0) -; RV32ZBA-NEXT: lw a0, 404(a0) -; RV32ZBA-NEXT: add a1, a0, a1 -; RV32ZBA-NEXT: add a0, a3, a2 -; RV32ZBA-NEXT: sltu a2, a0, a3 -; RV32ZBA-NEXT: add a1, a1, a2 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_sh3add: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh3add a1, a1, a0 -; RV64ZBA-NEXT: sh3add a0, a2, a0 -; RV64ZBA-NEXT: ld a1, 480(a1) -; RV64ZBA-NEXT: ld a0, 400(a0) -; RV64ZBA-NEXT: add a0, a0, a1 -; RV64ZBA-NEXT: ret -entry: - %b = getelementptr inbounds nuw i8, ptr %p, i64 400 - %add = add iXLen %x, 10 - %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %add - %0 = load i64, ptr %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %y - %1 = load i64, ptr %arrayidx2, align 8 - %add3 = add nsw i64 %1, %0 - ret i64 %add3 -} - -define signext i32 @test_sh2add(ptr %p, iXLen %x, iXLen %y) { -; RV32I-LABEL: test_sh2add: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: slli a2, a2, 2 -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lw a1, 1200(a1) -; RV32I-NEXT: lw a0, 1240(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_sh2add: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: slli a2, a2, 2 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: lw a1, 1200(a1) -; RV64I-NEXT: lw a0, 1240(a0) -; RV64I-NEXT: addw a0, a0, a1 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_sh2add: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh2add a1, a1, a0 -; RV32ZBA-NEXT: sh2add a0, a2, a0 -; RV32ZBA-NEXT: lw a1, 1200(a1) -; RV32ZBA-NEXT: lw a0, 1240(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_sh2add: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh2add a1, a1, a0 -; RV64ZBA-NEXT: sh2add a0, a2, a0 -; RV64ZBA-NEXT: lw a1, 1200(a1) -; RV64ZBA-NEXT: lw a0, 1240(a0) -; RV64ZBA-NEXT: addw a0, a0, a1 -; RV64ZBA-NEXT: ret -entry: - %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 - %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %x - %0 = load i32, ptr %arrayidx, align 4 - %add = add iXLen %y, 10 - %arrayidx2 = 
getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %add - %1 = load i32, ptr %arrayidx2, align 4 - %add3 = add nsw i32 %1, %0 - ret i32 %add3 -} - -define signext i16 @test_sh1add(ptr %p, iXLen %x, iXLen %y) { -; RV32I-LABEL: test_sh1add: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: slli a2, a2, 1 -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lh a1, 1600(a1) -; RV32I-NEXT: lh a0, 1620(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_sh1add: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: slli a2, a2, 1 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: lh a1, 1600(a1) -; RV64I-NEXT: lh a0, 1620(a0) -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_sh1add: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh1add a1, a1, a0 -; RV32ZBA-NEXT: sh1add a0, a2, a0 -; RV32ZBA-NEXT: lh a1, 1600(a1) -; RV32ZBA-NEXT: lh a0, 1620(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: slli a0, a0, 16 -; RV32ZBA-NEXT: srai a0, a0, 16 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_sh1add: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh1add a1, a1, a0 -; RV64ZBA-NEXT: sh1add a0, a2, a0 -; RV64ZBA-NEXT: lh a1, 1600(a1) -; RV64ZBA-NEXT: lh a0, 1620(a0) -; RV64ZBA-NEXT: add a0, a0, a1 -; RV64ZBA-NEXT: slli a0, a0, 48 -; RV64ZBA-NEXT: srai a0, a0, 48 -; RV64ZBA-NEXT: ret -entry: - %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 - %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %x - %0 = load i16, ptr %arrayidx, align 2 - %add = add iXLen %y, 10 - %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %add - %1 = load i16, ptr %arrayidx2, align 2 - %add4 = add i16 %1, %0 - ret i16 %add4 -} - -define zeroext i8 @test_add(ptr %p, iXLen %x, iXLen %y) { -; CHECK-LABEL: test_add: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: add a0, a2, a0 -; CHECK-NEXT: lbu a1, 1800(a1) -; CHECK-NEXT: lbu a0, 1810(a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: ret -; -; ZBA-LABEL: test_add: -; ZBA: # %bb.0: # %entry -; ZBA-NEXT: add a1, a0, a1 -; ZBA-NEXT: add a0, a2, a0 -; ZBA-NEXT: lbu a1, 1800(a1) -; ZBA-NEXT: lbu a0, 1810(a0) -; ZBA-NEXT: add a0, a0, a1 -; ZBA-NEXT: andi a0, a0, 255 -; ZBA-NEXT: ret -entry: - %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 - %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x - %0 = load i8, ptr %arrayidx, align 1 - %add = add iXLen %y, 10 - %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add - %1 = load i8, ptr %arrayidx2, align 1 - %add4 = add i8 %1, %0 - ret i8 %add4 -} - -define i64 @test_sh3add_uw(ptr %p, i32 signext %x, i32 signext %y) { -; RV32I-LABEL: test_sh3add_uw: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: slli a2, a2, 3 -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: lw a2, 400(a1) -; RV32I-NEXT: lw a1, 404(a1) -; RV32I-NEXT: lw a3, 400(a0) -; RV32I-NEXT: lw a0, 404(a0) -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a3, a2 -; RV32I-NEXT: sltu a2, a0, a3 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_sh3add_uw: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: slli a2, a2, 32 -; RV64I-NEXT: srli a1, a1, 29 -; RV64I-NEXT: srli 
a2, a2, 29 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: ld a1, 400(a1) -; RV64I-NEXT: ld a0, 400(a0) -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_sh3add_uw: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh3add a1, a1, a0 -; RV32ZBA-NEXT: sh3add a0, a2, a0 -; RV32ZBA-NEXT: lw a2, 400(a1) -; RV32ZBA-NEXT: lw a1, 404(a1) -; RV32ZBA-NEXT: lw a3, 400(a0) -; RV32ZBA-NEXT: lw a0, 404(a0) -; RV32ZBA-NEXT: add a1, a0, a1 -; RV32ZBA-NEXT: add a0, a3, a2 -; RV32ZBA-NEXT: sltu a2, a0, a3 -; RV32ZBA-NEXT: add a1, a1, a2 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_sh3add_uw: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh3add.uw a1, a1, a0 -; RV64ZBA-NEXT: sh3add.uw a0, a2, a0 -; RV64ZBA-NEXT: ld a1, 400(a1) -; RV64ZBA-NEXT: ld a0, 400(a0) -; RV64ZBA-NEXT: add a0, a0, a1 -; RV64ZBA-NEXT: ret -entry: - %b = getelementptr inbounds nuw i8, ptr %p, i64 400 - %idxprom = zext i32 %x to i64 - %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom - %0 = load i64, ptr %arrayidx, align 8 - %idxprom2 = zext i32 %y to i64 - %arrayidx3 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom2 - %1 = load i64, ptr %arrayidx3, align 8 - %add4 = add nsw i64 %1, %0 - ret i64 %add4 -} - -define signext i32 @test_sh2add_uw(ptr %p, i32 signext %x, i32 signext %y) { -; RV32I-LABEL: test_sh2add_uw: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: slli a2, a2, 2 -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: lw a1, 1200(a1) -; RV32I-NEXT: lw a0, 1200(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_sh2add_uw: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: slli a2, a2, 32 -; RV64I-NEXT: srli a1, a1, 30 -; RV64I-NEXT: srli a2, a2, 30 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: lw a1, 1200(a1) -; RV64I-NEXT: lw a0, 1200(a0) -; RV64I-NEXT: addw a0, a0, a1 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_sh2add_uw: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh2add a1, a1, a0 -; RV32ZBA-NEXT: sh2add a0, a2, a0 -; RV32ZBA-NEXT: lw a1, 1200(a1) -; RV32ZBA-NEXT: lw a0, 1200(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_sh2add_uw: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh2add.uw a1, a1, a0 -; RV64ZBA-NEXT: sh2add.uw a0, a2, a0 -; RV64ZBA-NEXT: lw a1, 1200(a1) -; RV64ZBA-NEXT: lw a0, 1200(a0) -; RV64ZBA-NEXT: addw a0, a0, a1 -; RV64ZBA-NEXT: ret -entry: - %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 - %idxprom = zext i32 %x to i64 - %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom - %0 = load i32, ptr %arrayidx, align 4 - %idxprom2 = zext i32 %y to i64 - %arrayidx3 = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom2 - %1 = load i32, ptr %arrayidx3, align 4 - %add4 = add nsw i32 %1, %0 - ret i32 %add4 -} - -define signext i16 @test_sh1add_uw(ptr %p, i32 signext %x, i32 signext %y) { -; RV32I-LABEL: test_sh1add_uw: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: slli a2, a2, 1 -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lh a1, 1600(a1) -; RV32I-NEXT: lh a0, 1620(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_sh1add_uw: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: addi a2, a2, 10 -; RV64I-NEXT: srli a1, a1, 31 -; RV64I-NEXT: 
slli a2, a2, 32 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: srli a2, a2, 31 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: lh a1, 1600(a1) -; RV64I-NEXT: lh a0, 1600(a0) -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_sh1add_uw: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh1add a1, a1, a0 -; RV32ZBA-NEXT: sh1add a0, a2, a0 -; RV32ZBA-NEXT: lh a1, 1600(a1) -; RV32ZBA-NEXT: lh a0, 1620(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: slli a0, a0, 16 -; RV32ZBA-NEXT: srai a0, a0, 16 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_sh1add_uw: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh1add.uw a1, a1, a0 -; RV64ZBA-NEXT: addi a2, a2, 10 -; RV64ZBA-NEXT: sh1add.uw a0, a2, a0 -; RV64ZBA-NEXT: lh a1, 1600(a1) -; RV64ZBA-NEXT: lh a0, 1600(a0) -; RV64ZBA-NEXT: add a0, a0, a1 -; RV64ZBA-NEXT: slli a0, a0, 48 -; RV64ZBA-NEXT: srai a0, a0, 48 -; RV64ZBA-NEXT: ret -entry: - %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 - %idxprom = zext i32 %x to i64 - %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom - %0 = load i16, ptr %arrayidx, align 2 - %add = add i32 %y, 10 - %idxprom2 = zext i32 %add to i64 - %arrayidx3 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom2 - %1 = load i16, ptr %arrayidx3, align 2 - %add5 = add i16 %1, %0 - ret i16 %add5 -} - -define zeroext i8 @test_add_uw(ptr %p, i32 signext %x, i32 signext %y) { -; RV32I-LABEL: test_add_uw: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: lbu a1, 1800(a1) -; RV32I-NEXT: lbu a0, 1800(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: andi a0, a0, 255 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_add_uw: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: slli a2, a2, 32 -; RV64I-NEXT: srli a1, a1, 32 -; RV64I-NEXT: srli a2, a2, 32 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: lbu a1, 1800(a1) -; RV64I-NEXT: lbu a0, 1800(a0) -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: andi a0, a0, 255 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_add_uw: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: add a1, a0, a1 -; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: lbu a1, 1800(a1) -; RV32ZBA-NEXT: lbu a0, 1800(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: andi a0, a0, 255 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_add_uw: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: add.uw a1, a1, a0 -; RV64ZBA-NEXT: add.uw a0, a2, a0 -; RV64ZBA-NEXT: lbu a1, 1800(a1) -; RV64ZBA-NEXT: lbu a0, 1800(a0) -; RV64ZBA-NEXT: add a0, a0, a1 -; RV64ZBA-NEXT: andi a0, a0, 255 -; RV64ZBA-NEXT: ret -entry: - %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 - %idxprom = zext i32 %x to i64 - %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom - %0 = load i8, ptr %arrayidx, align 1 - %idxprom2 = zext i32 %y to i64 - %arrayidx3 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom2 - %1 = load i8, ptr %arrayidx3, align 1 - %add5 = add i8 %1, %0 - ret i8 %add5 -} - -; The addi is part of the index and used with 2 different scales. 
-define signext i32 @test_scaled_index_addi(ptr %p, iXLen %x) { -; RV32I-LABEL: test_scaled_index_addi: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a2, a1, 2 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: add a2, a0, a2 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lw a1, 1196(a2) -; RV32I-NEXT: lh a0, 1598(a0) -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_scaled_index_addi: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a2, a1, 2 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: add a2, a0, a2 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lw a1, 1196(a2) -; RV64I-NEXT: lh a0, 1598(a0) -; RV64I-NEXT: addw a0, a1, a0 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_scaled_index_addi: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh2add a2, a1, a0 -; RV32ZBA-NEXT: sh1add a0, a1, a0 -; RV32ZBA-NEXT: lw a1, 1196(a2) -; RV32ZBA-NEXT: lh a0, 1598(a0) -; RV32ZBA-NEXT: add a0, a1, a0 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_scaled_index_addi: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh2add a2, a1, a0 -; RV64ZBA-NEXT: sh1add a0, a1, a0 -; RV64ZBA-NEXT: lw a1, 1196(a2) -; RV64ZBA-NEXT: lh a0, 1598(a0) -; RV64ZBA-NEXT: addw a0, a1, a0 -; RV64ZBA-NEXT: ret -entry: - %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 - %sub = add iXLen %x, -1 - %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %sub - %0 = load i32, ptr %arrayidx, align 4 - %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 - %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %sub - %1 = load i16, ptr %arrayidx2, align 2 - %conv = sext i16 %1 to i32 - %add = add nsw i32 %0, %conv - ret i32 %add -} - -; Offset is a pair of addis. We can fold one of them. -define signext i32 @test_medium_offset(ptr %p, iXLen %x, iXLen %y) { -; RV32I-LABEL: test_medium_offset: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi a0, a0, 2047 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: slli a2, a2, 2 -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lw a1, 753(a1) -; RV32I-NEXT: lw a0, 793(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_medium_offset: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi a0, a0, 2047 -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: slli a2, a2, 2 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: lw a1, 753(a1) -; RV64I-NEXT: lw a0, 793(a0) -; RV64I-NEXT: addw a0, a0, a1 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_medium_offset: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: addi a0, a0, 2047 -; RV32ZBA-NEXT: sh2add a1, a1, a0 -; RV32ZBA-NEXT: sh2add a0, a2, a0 -; RV32ZBA-NEXT: lw a1, 753(a1) -; RV32ZBA-NEXT: lw a0, 793(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_medium_offset: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addi a0, a0, 2047 -; RV64ZBA-NEXT: sh2add a1, a1, a0 -; RV64ZBA-NEXT: sh2add a0, a2, a0 -; RV64ZBA-NEXT: lw a1, 753(a1) -; RV64ZBA-NEXT: lw a0, 793(a0) -; RV64ZBA-NEXT: addw a0, a0, a1 -; RV64ZBA-NEXT: ret -entry: - %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 - %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x - %0 = load i32, ptr %arrayidx, align 4 - %add = add iXLen %y, 10 - %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %add - %1 = load i32, ptr %arrayidx2, align 4 - %add3 = add nsw i32 %1, %0 - ret i32 %add3 -} - -; Offset is a lui+addiw. We can't fold this on RV64. 
-define signext i32 @test_large_offset(ptr %p, iXLen %x, iXLen %y) { -; RV32I-LABEL: test_large_offset: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a3, 2 -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: slli a2, a2, 2 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: add a1, a0, a1 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: lw a1, -1392(a1) -; RV32I-NEXT: lw a0, -1352(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_large_offset: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: lui a3, 2 -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: slli a2, a2, 2 -; RV64I-NEXT: addiw a3, a3, -1392 -; RV64I-NEXT: add a0, a0, a3 -; RV64I-NEXT: add a1, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: lw a1, 0(a1) -; RV64I-NEXT: lw a0, 40(a0) -; RV64I-NEXT: addw a0, a0, a1 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_large_offset: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: li a3, 1700 -; RV32ZBA-NEXT: sh2add a0, a3, a0 -; RV32ZBA-NEXT: sh2add a1, a1, a0 -; RV32ZBA-NEXT: sh2add a0, a2, a0 -; RV32ZBA-NEXT: lw a1, 0(a1) -; RV32ZBA-NEXT: lw a0, 40(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_large_offset: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: li a3, 1700 -; RV64ZBA-NEXT: sh2add a0, a3, a0 -; RV64ZBA-NEXT: sh2add a1, a1, a0 -; RV64ZBA-NEXT: sh2add a0, a2, a0 -; RV64ZBA-NEXT: lw a1, 0(a1) -; RV64ZBA-NEXT: lw a0, 40(a0) -; RV64ZBA-NEXT: addw a0, a0, a1 -; RV64ZBA-NEXT: ret -entry: - %g = getelementptr inbounds nuw i8, ptr %p, i64 6800 - %arrayidx = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %x - %0 = load i32, ptr %arrayidx, align 4 - %add = add iXLen %y, 10 - %arrayidx2 = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %add - %1 = load i32, ptr %arrayidx2, align 4 - %add3 = add nsw i32 %1, %0 - ret i32 %add3 -} - -; After folding we can CSE the sh2add -define signext i32 @test_cse(ptr %p, iXLen %x) { -; RV32I-LABEL: test_cse: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lw a1, 1200(a0) -; RV32I-NEXT: addi a0, a0, 2047 -; RV32I-NEXT: lw a0, 753(a0) -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: ret -; -; RV64I-LABEL: test_cse: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lw a1, 1200(a0) -; RV64I-NEXT: addi a0, a0, 2047 -; RV64I-NEXT: lw a0, 753(a0) -; RV64I-NEXT: addw a0, a0, a1 -; RV64I-NEXT: ret -; -; RV32ZBA-LABEL: test_cse: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: sh2add a0, a1, a0 -; RV32ZBA-NEXT: lw a1, 1200(a0) -; RV32ZBA-NEXT: addi a0, a0, 2047 -; RV32ZBA-NEXT: lw a0, 753(a0) -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: ret -; -; RV64ZBA-LABEL: test_cse: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh2add a0, a1, a0 -; RV64ZBA-NEXT: lw a1, 1200(a0) -; RV64ZBA-NEXT: addi a0, a0, 2047 -; RV64ZBA-NEXT: lw a0, 753(a0) -; RV64ZBA-NEXT: addw a0, a0, a1 -; RV64ZBA-NEXT: ret -entry: - %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 - %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %x - %0 = load i32, ptr %arrayidx, align 4 - %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 - %arrayidx1 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x - %1 = load i32, ptr %arrayidx1, align 4 - %add = add nsw i32 %1, %0 - ret i32 %add -} - -define zeroext i8 @test_optsize(ptr %p, iXLen %x, iXLen %y) optsize { -; CHECK-LABEL: test_optsize: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi a0, a0, 1800 -; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: add a0, a2, a0 -; 
CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: lbu a0, 10(a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: ret -; -; ZBA-LABEL: test_optsize: -; ZBA: # %bb.0: # %entry -; ZBA-NEXT: addi a0, a0, 1800 -; ZBA-NEXT: add a1, a0, a1 -; ZBA-NEXT: add a0, a2, a0 -; ZBA-NEXT: lbu a1, 0(a1) -; ZBA-NEXT: lbu a0, 10(a0) -; ZBA-NEXT: add a0, a0, a1 -; ZBA-NEXT: andi a0, a0, 255 -; ZBA-NEXT: ret -entry: - %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 - %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x - %0 = load i8, ptr %arrayidx, align 1 - %add = add iXLen %y, 10 - %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add - %1 = load i8, ptr %arrayidx2, align 1 - %add4 = add i8 %1, %0 - ret i8 %add4 -} - -define zeroext i8 @test_minsize(ptr %p, iXLen %x, iXLen %y) minsize { -; CHECK-LABEL: test_minsize: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi a0, a0, 1800 -; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: add a0, a2, a0 -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: lbu a0, 10(a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: andi a0, a0, 255 -; CHECK-NEXT: ret -; -; ZBA-LABEL: test_minsize: -; ZBA: # %bb.0: # %entry -; ZBA-NEXT: addi a0, a0, 1800 -; ZBA-NEXT: add a1, a0, a1 -; ZBA-NEXT: add a0, a2, a0 -; ZBA-NEXT: lbu a1, 0(a1) -; ZBA-NEXT: lbu a0, 10(a0) -; ZBA-NEXT: add a0, a0, a1 -; ZBA-NEXT: andi a0, a0, 255 -; ZBA-NEXT: ret -entry: - %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 - %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x - %0 = load i8, ptr %arrayidx, align 1 - %add = add iXLen %y, 10 - %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add - %1 = load i8, ptr %arrayidx2, align 1 - %add4 = add i8 %1, %0 - ret i8 %add4 -} diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index b98aa954c09e7..8f5b044c3b3b8 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -14,13 +14,14 @@ define void @test1(ptr %sp, ptr %t, i32 %n) { ; RV32I-NEXT: lui a2, 20 ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: li a3, 2 +; RV32I-NEXT: addi a2, a2, -1920 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: sw a3, -1920(a0) -; RV32I-NEXT: sw a2, -1916(a0) -; RV32I-NEXT: sw a2, -1920(a1) -; RV32I-NEXT: sw a3, -1916(a1) +; RV32I-NEXT: sw a3, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: sw a3, 4(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test1: @@ -57,16 +58,17 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a4, 20 +; RV32I-NEXT: addi a4, a4, -1920 ; RV32I-NEXT: add a1, a1, a4 ; RV32I-NEXT: add a0, a0, a4 ; RV32I-NEXT: blez a2, .LBB1_2 ; RV32I-NEXT: .LBB1_1: # %while_body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: addi a4, a3, 1 -; RV32I-NEXT: sw a4, -1920(a0) -; RV32I-NEXT: sw a3, -1916(a0) -; RV32I-NEXT: sw a4, -1920(a1) -; RV32I-NEXT: sw a3, -1916(a1) +; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 0(a1) +; RV32I-NEXT: sw a3, 4(a1) ; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: blt a4, a2, .LBB1_1 ; RV32I-NEXT: .LBB1_2: # %while_end @@ -124,10 +126,11 @@ define void @test3(ptr %t) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, 20 ; RV32I-NEXT: li a2, 2 +; RV32I-NEXT: addi a1, a1, -1920 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li a1, 3 -; RV32I-NEXT: sw a2, -1916(a0) -; RV32I-NEXT: sw a1, -1912(a0) +; 
RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test3:
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index 578f51a957a75..e761fcb736a87 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -1136,9 +1136,10 @@ define i64 @lrd_large_offset(ptr %a, i64 %b) {
 ; RV32XTHEADMEMIDX-NEXT:    slli a1, a1, 3
 ; RV32XTHEADMEMIDX-NEXT:    add a0, a1, a0
 ; RV32XTHEADMEMIDX-NEXT:    lui a1, 23
+; RV32XTHEADMEMIDX-NEXT:    addi a1, a1, 1792
 ; RV32XTHEADMEMIDX-NEXT:    add a1, a0, a1
-; RV32XTHEADMEMIDX-NEXT:    lw a0, 1792(a1)
-; RV32XTHEADMEMIDX-NEXT:    lw a1, 1796(a1)
+; RV32XTHEADMEMIDX-NEXT:    lw a0, 0(a1)
+; RV32XTHEADMEMIDX-NEXT:    lw a1, 4(a1)
 ; RV32XTHEADMEMIDX-NEXT:    ret
 ;
 ; RV64XTHEADMEMIDX-LABEL: lrd_large_offset:
From c9ff8399647cd15cdb9f8853b45854920de17162 Mon Sep 17 00:00:00 2001
From: Kunwar Grover
Date: Thu, 20 Feb 2025 01:32:44 +0530
Subject: [PATCH 115/220] [mlir][Linalg] Fix linalg.generic iteration domain
 collapse for dynamic dims (#118208)

This PR fixes how the iteration domain of linalg.generic is collapsed when
fusing with tensor.expand_shape. Previously, the output_shape for
tensor.expand_shape was inferred, which only works in some special cases. This
patch makes the logic explicitly set the bounds of the new collapsed iteration
domain, since they are already known.

---------

Co-authored-by: Jakub Kuderski
---
 .../Linalg/Transforms/ElementwiseOpFusion.cpp | 31 +++++----
 .../fuse-with-reshape-by-collapsing.mlir      | 63 +++++++++++++------
 .../Dialect/Linalg/fusion-push-reshape.mlir   |  7 +--
 3 files changed, 67 insertions(+), 34 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 60cae77644291..f4b6955823085 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -1548,10 +1548,9 @@ static Value getCollapsedOpOperand(Location loc, LinalgOp op,
 
 /// Modify the `linalg.index` operations in the original generic op, to its
 /// value in the collapsed operation.
-void generateCollapsedIndexingRegion(Location loc, Block *block, - const CollapsingInfo &collapsingInfo, - ValueRange loopRange, - RewriterBase &rewriter) { +static void generateCollapsedIndexingRegion( + Location loc, Block *block, const CollapsingInfo &collapsingInfo, + ArrayRef loopRange, RewriterBase &rewriter) { OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointToStart(block); @@ -1572,10 +1571,12 @@ void generateCollapsedIndexingRegion(Location loc, Block *block, Value newIndexVal = rewriter.create(loc, foldedDims.index()); for (auto dim : llvm::reverse(foldedDimsRef.drop_front())) { + Value loopDim = + getValueOrCreateConstantIndexOp(rewriter, loc, loopRange[dim]); indexReplacementVals[dim] = - rewriter.create(loc, newIndexVal, loopRange[dim]); + rewriter.createOrFold(loc, newIndexVal, loopDim); newIndexVal = - rewriter.create(loc, newIndexVal, loopRange[dim]); + rewriter.createOrFold(loc, newIndexVal, loopDim); } indexReplacementVals[foldedDims.value().front()] = newIndexVal; } @@ -1722,14 +1723,13 @@ FailureOr mlir::linalg::collapseOpIterationDims( LinalgOp collapsedOp = createCollapsedOp(op, collapsingInfo, rewriter); Location loc = op->getLoc(); + SmallVector loopBound = + llvm::map_to_vector(loopRanges, [](Range range) { return range.size; }); + if (collapsedOp.hasIndexSemantics()) { // Collect the loop range of the generic op. OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPoint(collapsedOp); - SmallVector loopBound = - llvm::map_to_vector(loopRanges, [&](Range range) { - return getValueOrCreateConstantIndexOp(rewriter, loc, range.size); - }); generateCollapsedIndexingRegion(loc, &collapsedOp->getRegion(0).front(), collapsingInfo, loopBound, rewriter); } @@ -1747,15 +1747,22 @@ FailureOr mlir::linalg::collapseOpIterationDims( op.getIndexingMapMatchingResult(originalResult.value()); SmallVector reassociation = getOperandReassociation(indexingMap, collapsingInfo); + assert( + indexingMap.isProjectedPermutation() && + "Expected indexing map to be a projected permutation for collapsing"); + SmallVector resultShape = + applyPermutationMap(indexingMap, ArrayRef(loopBound)); Value result; if (isa(collapsedOpResult.getType())) { MemRefType expandShapeResultType = MemRefType::get( originalResultType.getShape(), originalResultType.getElementType()); result = rewriter.create( - loc, expandShapeResultType, collapsedOpResult, reassociation); + loc, expandShapeResultType, collapsedOpResult, reassociation, + resultShape); } else { result = rewriter.create( - loc, originalResultType, collapsedOpResult, reassociation); + loc, originalResultType, collapsedOpResult, reassociation, + resultShape); } results.push_back(result); } else { diff --git a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir index 7db997cd4c0b5..89734e7542801 100644 --- a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir +++ b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir @@ -225,6 +225,38 @@ func.func @fuse_by_collapsing_dynamic(%arg0 : tensor, // ----- +#map0 = affine_map<(d0, d1) -> (d0, d1)> +func.func @fuse_by_collapsing_dynamic_2(%arg0 : tensor, %sz0: index, %sz1: index) -> tensor { + %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [%sz0, %sz1] : tensor into tensor + %init = tensor.empty(%sz1, %sz0) : tensor + %1 = linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel", "parallel"]} + ins(%0 : tensor) + outs(%init : tensor) { + ^bb0(%b0 : f32, %b1 : f32): + 
%out = arith.negf %b0 : f32 + linalg.yield %out : f32 + } -> tensor + return %1 : tensor +} + +// CHECK-LABEL: func @fuse_by_collapsing_dynamic_2 +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] +// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] +// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[EXPANDED]], %[[C1]] +// CHECK: %[[OUT:.+]] = linalg.generic +// CHECK-SAME: ins(%[[ARG0]] : tensor) +// CHECK-SAME: outs(%{{.*}} : tensor) +// CHECK: %[[EXPANDED_1:.+]] = tensor.expand_shape %[[OUT]] +// CHECK-SAME: output_shape [%[[DIM0]], %[[DIM1]]] +// CHECK: return %[[EXPANDED_1]] + +// ----- + #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3)> func.func @fuse_reductions(%arg0 : tensor<2x?x5xf32>, %arg1 : tensor<2x5xf32>, %sz0: index) -> tensor<2x5xf32> { @@ -425,10 +457,11 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor, %arg1 : tensor<4 // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK: func @fuse_only_one_reassociation // CHECK-SAME: (%[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor<4x?x?x8xf32>, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index) -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[EXPAND_ARG0:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[SZ0]], 4, %[[SZ1]], 8] +// CHECK-DAG: %[[DIM:.+]] = tensor.dim %[[EXPAND_ARG0]], %[[C0]] : tensor +// CHECK-DAG: %[[DIM_2:.+]] = tensor.dim %[[EXPAND_ARG0]], %[[C2]] : tensor // CHECK-DAG: %[[COLLAPSE_ARG0:.+]] = tensor.collapse_shape %[[EXPAND_ARG0]] {{\[}}[0], [1], [2, 3]{{\]}} // CHECK-DAG: %[[COLLAPSE_ARG1_0:.+]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}} // CHECK-DAG: %[[COLLAPSE_ARG1_1:.+]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}} @@ -437,10 +470,7 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor, %arg1 : tensor<4 // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"] // CHECK-SAME: ins(%[[COLLAPSE_ARG0]], %[[COLLAPSE_ARG1_0]] : // CHECK-SAME: outs(%[[COLLAPSE_ARG1_1]] : -// CHECK: %[[DIM:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor<4x?x?xf32> -// CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C2]] : tensor<4x?x?xf32> -// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C8]] : index -// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0], [1], [2, 3]] output_shape [4, %[[DIM]], %[[VAL_1]], 8] : tensor<4x?x?xf32> into tensor<4x?x?x8xf32> +// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0], [1], [2, 3]] output_shape [4, %[[DIM]], %[[DIM_2]], 8] : tensor<4x?x?xf32> into tensor<4x?x?x8xf32> // CHECK: return %[[EXPANDED_3]] // ----- @@ -475,15 +505,16 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1, d0)> // CHECK: func @fold_non_consecutive_dims( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index) -// CHECK: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[C4:.+]] = arith.constant 4 : index -// CHECK: %[[C8:.+]] = arith.constant 8 : index -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 
: index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[SZ0]], 4, %[[SZ1]], 8] : tensor into tensor -// CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] -// CHECK: %[[DIM_0:.+]] = tensor.dim %[[EXPANDED]], %[[C2]] +// CHECK-DAG: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] +// CHECK-DAG: %[[DIM_0:.+]] = tensor.dim %[[EXPANDED]], %[[C2]] // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM_0]], %[[DIM]]) +// CHECK-DAG: %[[DIM_1:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] +// CHECK-DAG: %[[DIM_2:.+]] = tensor.dim %[[EXPANDED]], %[[C2]] // CHECK: %[[COLLAPSE_INIT:.+]] = tensor.collapse_shape %[[INIT]] {{\[}}[0, 1], [2, 3]{{\]}} // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]] @@ -502,11 +533,7 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK-DAG: %[[T6:.+]] = arith.addi %[[T5]], %[[T3]] // CHECK-DAG: %[[T7:.+]] = arith.index_cast %[[T6]] // CHECK: linalg.yield %[[T7]] -// CHECK: %[[DIM_1:.+]] = tensor.dim %[[GENERIC]], %[[C0]] : tensor -// CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C8]] : index -// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C4]] : index -// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 4] : tensor into tensor +// CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[DIM_2]], 8, %[[DIM_1]], 4] : tensor into tensor // CHECK: return %[[EXPANDED_3]] // ----- diff --git a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir index 7acbd843cd1e7..fd3c321722508 100644 --- a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir +++ b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir @@ -5,15 +5,14 @@ // CHECK-LABEL: func @reshape // CHECK-SAME: (%[[A:.*]]: tensor, %[[B:.*]]: tensor<16xf32>, %[[INIT:.*]]: tensor, %[[SZ0:.*]]: index) -// CHECK: %[[C112:.*]] = arith.constant 112 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[A]] +// CHECK: %[[DIM:.*]] = tensor.dim %[[EXPANDED]], %[[C0]] // CHECK: %[[RI:.*]] = tensor.collapse_shape %[[INIT]] {{\[}}[0, 1], [2]] : tensor into tensor // CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]], #[[$MAP2]]], // CHECK-SAME: iterator_types = ["parallel", "parallel"]} // CHECK-SAME: ins(%[[A]], %[[B]] : tensor, tensor<16xf32>) outs(%[[RI]] : tensor) -// CHECK: %[[DIM:.*]] = tensor.dim %[[R]], %[[C0]] : tensor -// CHECK: %[[VAL_1:.*]] = arith.divsi %[[DIM]], %[[C112]] : index -// CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[\[}}0, 1], [2]] output_shape [%[[VAL_1]], 112, 16] : tensor into tensor +// CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[\[}}0, 1], [2]] output_shape [%[[DIM]], 112, 16] : tensor into tensor // CHECK: return %[[RR]] : tensor func.func @reshape(%A: tensor, %B: tensor<16xf32>, %init: tensor, %sz0: index) -> tensor { %0 = tensor.expand_shape %A [[0, 1], [2]] output_shape [%sz0, 112, 16] From 26e375046dbde27957548d5e9a9885c3eff29019 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 19 Feb 2025 11:30:55 -0800 Subject: [PATCH 116/220] Recommit "[RISCV] Add a pass to remove ADDI by reassociating to fold 
into load/store address. (#127151)" Tests have been re-generated with recent scheduler changes. Original message: SelectionDAG will not reassociate adds to the end of a chain if there are multiple users of later additions. This prevents isel from folding the immediate into a load/store address. One easy way to see this is accessing an array in a struct with two different indices. An ADDI will be used to get to the start of the array then 2 different SHXADD instructions will be used to add the scaled indices. Finally the SHXADD will be used by different load instructions. We can remove the ADDI by folding the offset into each load. This patch adds a new pass that analyzes how an ADDI constant propagates through address arithmetic. If the arithmetic is only used by a load/store and the offset is small enough, we can adjust the load/store offset and remove the ADDI. This pass is placed before MachineCSE to allow cleanups if some instructions become common after removing offsets from their inputs. This pass gives ~3% improvement on dynamic instruction count on 541.leela_r and 544.nab_r from SPEC2017 for the train data set. There's a ~1% improvement on 557.xz_r. --- llvm/lib/Target/RISCV/CMakeLists.txt | 1 + llvm/lib/Target/RISCV/RISCV.h | 3 + llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp | 282 +++++++ llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 + llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/fold-mem-offset.ll | 733 +++++++++++++++++++ llvm/test/CodeGen/RISCV/split-offsets.ll | 25 +- llvm/test/CodeGen/RISCV/xtheadmemidx.ll | 5 +- 8 files changed, 1035 insertions(+), 17 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp create mode 100644 llvm/test/CodeGen/RISCV/fold-mem-offset.ll diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 9b23a5ab521c8..5d1ea50eba494 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -37,6 +37,7 @@ add_llvm_target(RISCVCodeGen RISCVMakeCompressible.cpp RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp + RISCVFoldMemOffset.cpp RISCVFrameLowering.cpp RISCVGatherScatterLowering.cpp RISCVIndirectBranchTracking.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 851eea1352852..641e2eb4094f9 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -52,6 +52,9 @@ void initializeRISCVVectorPeepholePass(PassRegistry &); FunctionPass *createRISCVOptWInstrsPass(); void initializeRISCVOptWInstrsPass(PassRegistry &); +FunctionPass *createRISCVFoldMemOffsetPass(); +void initializeRISCVFoldMemOffsetPass(PassRegistry &); + FunctionPass *createRISCVMergeBaseOffsetOptPass(); void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp new file mode 100644 index 0000000000000..989e9d859d64f --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVFoldMemOffset.cpp @@ -0,0 +1,282 @@ +//===- RISCVFoldMemOffset.cpp - Fold ADDI into memory offsets ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// Look for ADDIs that can be removed by folding their immediate into later +// load/store addresses. 
There may be other arithmetic instructions between the +// addi and load/store that we need to reassociate through. If the final result +// of the arithmetic is only used by load/store addresses, we can fold the +// offset into the all the load/store as long as it doesn't create an offset +// that is too large. +// +//===---------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "riscv-fold-mem-offset" +#define RISCV_FOLD_MEM_OFFSET_NAME "RISC-V Fold Memory Offset" + +namespace { + +class RISCVFoldMemOffset : public MachineFunctionPass { +public: + static char ID; + + RISCVFoldMemOffset() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + bool foldOffset(Register OrigReg, int64_t InitialOffset, + const MachineRegisterInfo &MRI, + DenseMap &FoldableInstrs); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return RISCV_FOLD_MEM_OFFSET_NAME; } +}; + +// Wrapper class around a std::optional to allow accumulation. +class FoldableOffset { + std::optional Offset; + +public: + bool hasValue() const { return Offset.has_value(); } + int64_t getValue() const { return *Offset; } + + FoldableOffset &operator=(int64_t RHS) { + Offset = RHS; + return *this; + } + + FoldableOffset &operator+=(int64_t RHS) { + if (!Offset) + Offset = 0; + Offset = (uint64_t)*Offset + (uint64_t)RHS; + return *this; + } + + int64_t operator*() { return *Offset; } +}; + +} // end anonymous namespace + +char RISCVFoldMemOffset::ID = 0; +INITIALIZE_PASS(RISCVFoldMemOffset, DEBUG_TYPE, RISCV_FOLD_MEM_OFFSET_NAME, + false, false) + +FunctionPass *llvm::createRISCVFoldMemOffsetPass() { + return new RISCVFoldMemOffset(); +} + +// Walk forward from the ADDI looking for arithmetic instructions we can +// analyze or memory instructions that use it as part of their address +// calculation. For each arithmetic instruction we lookup how the offset +// contributes to the value in that register use that information to +// calculate the contribution to the output of this instruction. +// Only addition and left shift are supported. +// FIXME: Add multiplication by constant. The constant will be in a register. +bool RISCVFoldMemOffset::foldOffset( + Register OrigReg, int64_t InitialOffset, const MachineRegisterInfo &MRI, + DenseMap &FoldableInstrs) { + // Map to hold how much the offset contributes to the value of this register. + DenseMap RegToOffsetMap; + + // Insert root offset into the map. 
+ RegToOffsetMap[OrigReg] = InitialOffset; + + std::queue Worklist; + Worklist.push(OrigReg); + + while (!Worklist.empty()) { + Register Reg = Worklist.front(); + Worklist.pop(); + + if (!Reg.isVirtual()) + return false; + + for (auto &User : MRI.use_nodbg_instructions(Reg)) { + FoldableOffset Offset; + + switch (User.getOpcode()) { + default: + return false; + case RISCV::ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = I->second; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::SH1ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << 1; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::SH2ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << 2; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::SH3ADD: + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << 3; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset += I->second; + break; + case RISCV::ADD_UW: + case RISCV::SH1ADD_UW: + case RISCV::SH2ADD_UW: + case RISCV::SH3ADD_UW: + // Don't fold through the zero extended input. + if (User.getOperand(1).getReg() == Reg) + return false; + if (auto I = RegToOffsetMap.find(User.getOperand(2).getReg()); + I != RegToOffsetMap.end()) + Offset = I->second; + break; + case RISCV::SLLI: { + unsigned ShAmt = User.getOperand(2).getImm(); + if (auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + I != RegToOffsetMap.end()) + Offset = (uint64_t)I->second << ShAmt; + break; + } + case RISCV::LB: + case RISCV::LBU: + case RISCV::SB: + case RISCV::LH: + case RISCV::LH_INX: + case RISCV::LHU: + case RISCV::FLH: + case RISCV::SH: + case RISCV::SH_INX: + case RISCV::FSH: + case RISCV::LW: + case RISCV::LW_INX: + case RISCV::LWU: + case RISCV::FLW: + case RISCV::SW: + case RISCV::SW_INX: + case RISCV::FSW: + case RISCV::LD: + case RISCV::FLD: + case RISCV::SD: + case RISCV::FSD: { + // Can't fold into store value. + if (User.getOperand(0).getReg() == Reg) + return false; + + // Existing offset must be immediate. + if (!User.getOperand(2).isImm()) + return false; + + // Require at least one operation between the ADDI and the load/store. + // We have other optimizations that should handle the simple case. + if (User.getOperand(1).getReg() == OrigReg) + return false; + + auto I = RegToOffsetMap.find(User.getOperand(1).getReg()); + if (I == RegToOffsetMap.end()) + return false; + + int64_t LocalOffset = User.getOperand(2).getImm(); + assert(isInt<12>(LocalOffset)); + int64_t CombinedOffset = (uint64_t)LocalOffset + (uint64_t)I->second; + if (!isInt<12>(CombinedOffset)) + return false; + + FoldableInstrs[&User] = CombinedOffset; + continue; + } + } + + // If we reach here we should have an accumulated offset. + assert(Offset.hasValue() && "Expected an offset"); + + // If the offset is new or changed, add the destination register to the + // work list. 
+ int64_t OffsetVal = Offset.getValue(); + auto P = + RegToOffsetMap.try_emplace(User.getOperand(0).getReg(), OffsetVal); + if (P.second) { + Worklist.push(User.getOperand(0).getReg()); + } else if (P.first->second != OffsetVal) { + P.first->second = OffsetVal; + Worklist.push(User.getOperand(0).getReg()); + } + } + } + + return true; +} + +bool RISCVFoldMemOffset::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + // This optimization may increase size by preventing compression. + if (MF.getFunction().hasOptSize()) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + // FIXME: We can support ADDIW from an LUI+ADDIW pair if the result is + // equivalent to LUI+ADDI. + if (MI.getOpcode() != RISCV::ADDI) + continue; + + // We only want to optimize register ADDIs. + if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) + continue; + + // Ignore 'li'. + if (MI.getOperand(1).getReg() == RISCV::X0) + continue; + + int64_t Offset = MI.getOperand(2).getImm(); + assert(isInt<12>(Offset)); + + DenseMap FoldableInstrs; + + if (!foldOffset(MI.getOperand(0).getReg(), Offset, MRI, FoldableInstrs)) + continue; + + if (FoldableInstrs.empty()) + continue; + + // We can fold this ADDI. + // Rewrite all the instructions. + for (auto [MemMI, NewOffset] : FoldableInstrs) + MemMI->getOperand(2).setImm(NewOffset); + + MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); + MI.eraseFromParent(); + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 167dbb53c5950..89e017807363b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -133,6 +133,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVPostRAExpandPseudoPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVOptWInstrsPass(*PR); + initializeRISCVFoldMemOffsetPass(*PR); initializeRISCVPreRAExpandPseudoPass(*PR); initializeRISCVExpandPseudoPass(*PR); initializeRISCVVectorPeepholePass(*PR); @@ -590,6 +591,7 @@ void RISCVPassConfig::addMachineSSAOptimization() { addPass(createRISCVVectorPeepholePass()); // TODO: Move this to pre regalloc addPass(createRISCVVMV0EliminationPass()); + addPass(createRISCVFoldMemOffsetPass()); TargetPassConfig::addMachineSSAOptimization(); diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 2646dfeca4eb6..194223eee69eb 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -98,6 +98,7 @@ ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: RISC-V Vector Peephole Optimization ; CHECK-NEXT: RISC-V VMV0 Elimination +; CHECK-NEXT: RISC-V Fold Memory Offset ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs diff --git a/llvm/test/CodeGen/RISCV/fold-mem-offset.ll b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll new file mode 100644 index 0000000000000..54eb3c9627691 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/fold-mem-offset.ll @@ -0,0 +1,733 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 | FileCheck %s 
--check-prefixes=CHECK,RV32I +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 | FileCheck %s --check-prefixes=CHECK,RV64I +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV32ZBA +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zba | FileCheck %s --check-prefixes=ZBA,RV64ZBA + +define i64 @test_sh3add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh3add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a1, a1, a0 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a2, 480(a1) +; RV32I-NEXT: lw a1, 484(a1) +; RV32I-NEXT: lw a3, 404(a0) +; RV32I-NEXT: lw a4, 400(a0) +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a0, a4, a2 +; RV32I-NEXT: sltu a2, a0, a4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh3add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: slli a2, a2, 3 +; RV64I-NEXT: add a1, a1, a0 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a1, 480(a1) +; RV64I-NEXT: ld a0, 400(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh3add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh3add a1, a1, a0 +; RV32ZBA-NEXT: sh3add a0, a2, a0 +; RV32ZBA-NEXT: lw a2, 480(a1) +; RV32ZBA-NEXT: lw a1, 484(a1) +; RV32ZBA-NEXT: lw a3, 404(a0) +; RV32ZBA-NEXT: lw a4, 400(a0) +; RV32ZBA-NEXT: add a1, a3, a1 +; RV32ZBA-NEXT: add a0, a4, a2 +; RV32ZBA-NEXT: sltu a2, a0, a4 +; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh3add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add a1, a1, a0 +; RV64ZBA-NEXT: sh3add a0, a2, a0 +; RV64ZBA-NEXT: ld a1, 480(a1) +; RV64ZBA-NEXT: ld a0, 400(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %b = getelementptr inbounds nuw i8, ptr %p, i64 400 + %add = add iXLen %x, 10 + %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %add + %0 = load i64, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, iXLen %y + %1 = load i64, ptr %arrayidx2, align 8 + %add3 = add nsw i64 %1, %0 + ret i64 %add3 +} + +define signext i32 @test_sh2add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh2add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, 1200(a1) +; RV32I-NEXT: lw a0, 1240(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh2add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 1200(a1) +; RV64I-NEXT: lw a0, 1240(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh2add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 1200(a1) +; RV32ZBA-NEXT: lw a0, 1240(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh2add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 1200(a1) +; RV64ZBA-NEXT: lw a0, 1240(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = 
getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +define signext i16 @test_sh1add(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_sh1add: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lh a1, 1600(a1) +; RV32I-NEXT: lh a0, 1620(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh1add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: slli a2, a2, 1 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lh a1, 1600(a1) +; RV64I-NEXT: lh a0, 1620(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh1add: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh1add a1, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a2, a0 +; RV32ZBA-NEXT: lh a1, 1600(a1) +; RV32ZBA-NEXT: lh a0, 1620(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: slli a0, a0, 16 +; RV32ZBA-NEXT: srai a0, a0, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh1add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh1add a1, a1, a0 +; RV64ZBA-NEXT: sh1add a0, a2, a0 +; RV64ZBA-NEXT: lh a1, 1600(a1) +; RV64ZBA-NEXT: lh a0, 1620(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: slli a0, a0, 48 +; RV64ZBA-NEXT: srai a0, a0, 48 +; RV64ZBA-NEXT: ret +entry: + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %x + %0 = load i16, ptr %arrayidx, align 2 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %add + %1 = load i16, ptr %arrayidx2, align 2 + %add4 = add i16 %1, %0 + ret i16 %add4 +} + +define zeroext i8 @test_add(ptr %p, iXLen %x, iXLen %y) { +; CHECK-LABEL: test_add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: lbu a1, 1800(a1) +; CHECK-NEXT: lbu a0, 1810(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_add: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 1800(a1) +; ZBA-NEXT: lbu a0, 1810(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} + +define i64 @test_sh3add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh3add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a2, a2, 3 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a2, 404(a0) +; RV32I-NEXT: lw a3, 400(a1) +; RV32I-NEXT: lw a1, 404(a1) +; RV32I-NEXT: lw a4, 400(a0) +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: add a0, a4, a3 +; RV32I-NEXT: sltu a2, a0, a4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh3add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 29 +; RV64I-NEXT: srli 
a2, a2, 29 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: ld a1, 400(a1) +; RV64I-NEXT: ld a0, 400(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh3add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh3add a1, a1, a0 +; RV32ZBA-NEXT: sh3add a0, a2, a0 +; RV32ZBA-NEXT: lw a2, 404(a0) +; RV32ZBA-NEXT: lw a3, 400(a1) +; RV32ZBA-NEXT: lw a1, 404(a1) +; RV32ZBA-NEXT: lw a4, 400(a0) +; RV32ZBA-NEXT: add a1, a2, a1 +; RV32ZBA-NEXT: add a0, a4, a3 +; RV32ZBA-NEXT: sltu a2, a0, a4 +; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh3add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh3add.uw a1, a1, a0 +; RV64ZBA-NEXT: sh3add.uw a0, a2, a0 +; RV64ZBA-NEXT: ld a1, 400(a1) +; RV64ZBA-NEXT: ld a0, 400(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %b = getelementptr inbounds nuw i8, ptr %p, i64 400 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom + %0 = load i64, ptr %arrayidx, align 8 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i64], ptr %b, i64 0, i64 %idxprom2 + %1 = load i64, ptr %arrayidx3, align 8 + %add4 = add nsw i64 %1, %0 + ret i64 %add4 +} + +define signext i32 @test_sh2add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh2add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lw a1, 1200(a1) +; RV32I-NEXT: lw a0, 1200(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh2add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 30 +; RV64I-NEXT: srli a2, a2, 30 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lw a1, 1200(a1) +; RV64I-NEXT: lw a0, 1200(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh2add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 1200(a1) +; RV32ZBA-NEXT: lw a0, 1200(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh2add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add.uw a1, a1, a0 +; RV64ZBA-NEXT: sh2add.uw a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 1200(a1) +; RV64ZBA-NEXT: lw a0, 1200(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, i64 %idxprom2 + %1 = load i32, ptr %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + ret i32 %add4 +} + +define signext i16 @test_sh1add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_sh1add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lh a1, 1600(a1) +; RV32I-NEXT: lh a0, 1620(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_sh1add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: addi a2, a2, 10 +; RV64I-NEXT: srli a1, a1, 31 +; RV64I-NEXT: 
slli a2, a2, 32 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: srli a2, a2, 31 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lh a1, 1600(a1) +; RV64I-NEXT: lh a0, 1600(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_sh1add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh1add a1, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a2, a0 +; RV32ZBA-NEXT: lh a1, 1600(a1) +; RV32ZBA-NEXT: lh a0, 1620(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: slli a0, a0, 16 +; RV32ZBA-NEXT: srai a0, a0, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_sh1add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh1add.uw a1, a1, a0 +; RV64ZBA-NEXT: addi a2, a2, 10 +; RV64ZBA-NEXT: sh1add.uw a0, a2, a0 +; RV64ZBA-NEXT: lh a1, 1600(a1) +; RV64ZBA-NEXT: lh a0, 1600(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: slli a0, a0, 48 +; RV64ZBA-NEXT: srai a0, a0, 48 +; RV64ZBA-NEXT: ret +entry: + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom + %0 = load i16, ptr %arrayidx, align 2 + %add = add i32 %y, 10 + %idxprom2 = zext i32 %add to i64 + %arrayidx3 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, i64 %idxprom2 + %1 = load i16, ptr %arrayidx3, align 2 + %add5 = add i16 %1, %0 + ret i16 %add5 +} + +define zeroext i8 @test_add_uw(ptr %p, i32 signext %x, i32 signext %y) { +; RV32I-LABEL: test_add_uw: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: lbu a1, 1800(a1) +; RV32I-NEXT: lbu a0, 1800(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_add_uw: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: srli a2, a2, 32 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: lbu a1, 1800(a1) +; RV64I-NEXT: lbu a0, 1800(a0) +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_add_uw: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: lbu a1, 1800(a1) +; RV32ZBA-NEXT: lbu a0, 1800(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: andi a0, a0, 255 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_add_uw: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add.uw a1, a1, a0 +; RV64ZBA-NEXT: add.uw a0, a2, a0 +; RV64ZBA-NEXT: lbu a1, 1800(a1) +; RV64ZBA-NEXT: lbu a0, 1800(a0) +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: andi a0, a0, 255 +; RV64ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %idxprom = zext i32 %x to i64 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom + %0 = load i8, ptr %arrayidx, align 1 + %idxprom2 = zext i32 %y to i64 + %arrayidx3 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, i64 %idxprom2 + %1 = load i8, ptr %arrayidx3, align 1 + %add5 = add i8 %1, %0 + ret i8 %add5 +} + +; The addi is part of the index and used with 2 different scales. 
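+; With the fold, the -1 from the shared index is carried into each load offset:
+; the i32 access is 1200 + 4*(x-1) = 4*x + 1196 and the i16 access is
+; 1600 + 2*(x-1) = 2*x + 1598, matching the 1196/1598 immediates in the CHECK
+; lines below.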
+define signext i32 @test_scaled_index_addi(ptr %p, iXLen %x) { +; RV32I-LABEL: test_scaled_index_addi: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a2, a1, 2 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 1196(a2) +; RV32I-NEXT: lh a0, 1598(a0) +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_scaled_index_addi: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: add a2, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a1, 1196(a2) +; RV64I-NEXT: lh a0, 1598(a0) +; RV64I-NEXT: addw a0, a1, a0 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_scaled_index_addi: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a2, a1, a0 +; RV32ZBA-NEXT: sh1add a0, a1, a0 +; RV32ZBA-NEXT: lw a1, 1196(a2) +; RV32ZBA-NEXT: lh a0, 1598(a0) +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_scaled_index_addi: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a2, a1, a0 +; RV64ZBA-NEXT: sh1add a0, a1, a0 +; RV64ZBA-NEXT: lw a1, 1196(a2) +; RV64ZBA-NEXT: lh a0, 1598(a0) +; RV64ZBA-NEXT: addw a0, a1, a0 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %sub = add iXLen %x, -1 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %sub + %0 = load i32, ptr %arrayidx, align 4 + %d = getelementptr inbounds nuw i8, ptr %p, i64 1600 + %arrayidx2 = getelementptr inbounds nuw [100 x i16], ptr %d, i64 0, iXLen %sub + %1 = load i16, ptr %arrayidx2, align 2 + %conv = sext i16 %1 to i32 + %add = add nsw i32 %0, %conv + ret i32 %add +} + +; Offset is a pair of addis. We can fold one of them. +define signext i32 @test_medium_offset(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_medium_offset: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, 753(a1) +; RV32I-NEXT: lw a0, 793(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_medium_offset: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a0, a0, 2047 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 753(a1) +; RV64I-NEXT: lw a0, 793(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_medium_offset: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a0, a0, 2047 +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 753(a1) +; RV32ZBA-NEXT: lw a0, 793(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_medium_offset: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a0, a0, 2047 +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 753(a1) +; RV64ZBA-NEXT: lw a0, 793(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 + %arrayidx = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +; Offset is a lui+addiw. We can't fold this on RV64. 
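+; On RV64 the base is materialized with LUI+ADDIW, which the pass does not yet
+; fold through (see the ADDIW FIXME in RISCVFoldMemOffset.cpp); on RV32 the
+; plain ADDI is folded, giving the -1392/-1352 offsets in the RV32I checks
+; below.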
+define signext i32 @test_large_offset(ptr %p, iXLen %x, iXLen %y) { +; RV32I-LABEL: test_large_offset: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a3, 2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: slli a2, a2, 2 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: lw a1, -1392(a1) +; RV32I-NEXT: lw a0, -1352(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_large_offset: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: lui a3, 2 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: slli a2, a2, 2 +; RV64I-NEXT: addiw a3, a3, -1392 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: lw a1, 0(a1) +; RV64I-NEXT: lw a0, 40(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_large_offset: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: li a3, 1700 +; RV32ZBA-NEXT: sh2add a0, a3, a0 +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: sh2add a0, a2, a0 +; RV32ZBA-NEXT: lw a1, 0(a1) +; RV32ZBA-NEXT: lw a0, 40(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_large_offset: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: li a3, 1700 +; RV64ZBA-NEXT: sh2add a0, a3, a0 +; RV64ZBA-NEXT: sh2add a1, a1, a0 +; RV64ZBA-NEXT: sh2add a0, a2, a0 +; RV64ZBA-NEXT: lw a1, 0(a1) +; RV64ZBA-NEXT: lw a0, 40(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %g = getelementptr inbounds nuw i8, ptr %p, i64 6800 + %arrayidx = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [200 x i32], ptr %g, i64 0, iXLen %add + %1 = load i32, ptr %arrayidx2, align 4 + %add3 = add nsw i32 %1, %0 + ret i32 %add3 +} + +; After folding we can CSE the sh2add +define signext i32 @test_cse(ptr %p, iXLen %x) { +; RV32I-LABEL: test_cse: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 1200(a0) +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: lw a0, 753(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_cse: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lw a1, 1200(a0) +; RV64I-NEXT: addi a0, a0, 2047 +; RV64I-NEXT: lw a0, 753(a0) +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBA-LABEL: test_cse: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sh2add a0, a1, a0 +; RV32ZBA-NEXT: lw a1, 1200(a0) +; RV32ZBA-NEXT: addi a0, a0, 2047 +; RV32ZBA-NEXT: lw a0, 753(a0) +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: test_cse: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sh2add a0, a1, a0 +; RV64ZBA-NEXT: lw a1, 1200(a0) +; RV64ZBA-NEXT: addi a0, a0, 2047 +; RV64ZBA-NEXT: lw a0, 753(a0) +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: ret +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 1200 + %arrayidx = getelementptr inbounds nuw [100 x i32], ptr %c, i64 0, iXLen %x + %0 = load i32, ptr %arrayidx, align 4 + %f = getelementptr inbounds nuw i8, ptr %p, i64 2800 + %arrayidx1 = getelementptr inbounds nuw [1000 x i32], ptr %f, i64 0, iXLen %x + %1 = load i32, ptr %arrayidx1, align 4 + %add = add nsw i32 %1, %0 + ret i32 %add +} + +define zeroext i8 @test_optsize(ptr %p, iXLen %x, iXLen %y) optsize { +; CHECK-LABEL: test_optsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a0, a0, 1800 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; 
CHECK-NEXT: lbu a1, 0(a1) +; CHECK-NEXT: lbu a0, 10(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_optsize: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: addi a0, a0, 1800 +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 0(a1) +; ZBA-NEXT: lbu a0, 10(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} + +define zeroext i8 @test_minsize(ptr %p, iXLen %x, iXLen %y) minsize { +; CHECK-LABEL: test_minsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a0, a0, 1800 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: lbu a1, 0(a1) +; CHECK-NEXT: lbu a0, 10(a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret +; +; ZBA-LABEL: test_minsize: +; ZBA: # %bb.0: # %entry +; ZBA-NEXT: addi a0, a0, 1800 +; ZBA-NEXT: add a1, a0, a1 +; ZBA-NEXT: add a0, a2, a0 +; ZBA-NEXT: lbu a1, 0(a1) +; ZBA-NEXT: lbu a0, 10(a0) +; ZBA-NEXT: add a0, a0, a1 +; ZBA-NEXT: andi a0, a0, 255 +; ZBA-NEXT: ret +entry: + %e = getelementptr inbounds nuw i8, ptr %p, i64 1800 + %arrayidx = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %x + %0 = load i8, ptr %arrayidx, align 1 + %add = add iXLen %y, 10 + %arrayidx2 = getelementptr inbounds nuw [1000 x i8], ptr %e, i64 0, iXLen %add + %1 = load i8, ptr %arrayidx2, align 1 + %add4 = add i8 %1, %0 + ret i8 %add4 +} diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index 8f5b044c3b3b8..cecd34956df8c 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -11,17 +11,16 @@ define void @test1(ptr %sp, ptr %t, i32 %n) { ; RV32I-LABEL: test1: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a2, 20 ; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lui a2, 20 ; RV32I-NEXT: li a3, 2 -; RV32I-NEXT: addi a2, a2, -1920 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a2, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) +; RV32I-NEXT: sw a3, -1920(a0) +; RV32I-NEXT: sw a2, -1916(a0) +; RV32I-NEXT: sw a2, -1920(a1) +; RV32I-NEXT: sw a3, -1916(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test1: @@ -58,17 +57,16 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a4, 20 -; RV32I-NEXT: addi a4, a4, -1920 ; RV32I-NEXT: add a1, a1, a4 ; RV32I-NEXT: add a0, a0, a4 ; RV32I-NEXT: blez a2, .LBB1_2 ; RV32I-NEXT: .LBB1_1: # %while_body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: addi a4, a3, 1 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a4, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) +; RV32I-NEXT: sw a4, -1920(a0) +; RV32I-NEXT: sw a3, -1916(a0) +; RV32I-NEXT: sw a4, -1920(a1) +; RV32I-NEXT: sw a3, -1916(a1) ; RV32I-NEXT: mv a3, a4 ; RV32I-NEXT: blt a4, a2, .LBB1_1 ; RV32I-NEXT: .LBB1_2: # %while_end @@ -126,11 +124,10 @@ define void @test3(ptr %t) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, 20 ; RV32I-NEXT: li a2, 2 -; RV32I-NEXT: addi a1, a1, -1920 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li 
a1, 3 -; RV32I-NEXT: sw a2, 4(a0) -; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a2, -1916(a0) +; RV32I-NEXT: sw a1, -1912(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test3: diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index e761fcb736a87..578f51a957a75 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -1136,10 +1136,9 @@ define i64 @lrd_large_offset(ptr %a, i64 %b) { ; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 3 ; RV32XTHEADMEMIDX-NEXT: add a0, a1, a0 ; RV32XTHEADMEMIDX-NEXT: lui a1, 23 -; RV32XTHEADMEMIDX-NEXT: addi a1, a1, 1792 ; RV32XTHEADMEMIDX-NEXT: add a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: lw a0, 0(a1) -; RV32XTHEADMEMIDX-NEXT: lw a1, 4(a1) +; RV32XTHEADMEMIDX-NEXT: lw a0, 1792(a1) +; RV32XTHEADMEMIDX-NEXT: lw a1, 1796(a1) ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lrd_large_offset: From 84eacd302e54a8a1b2b56684efb422911707c6af Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 19 Feb 2025 20:12:30 +0000 Subject: [PATCH 117/220] [gn build] Port 26e375046dbd --- llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index e3095e2f3df26..f18e40a2a5744 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -118,6 +118,7 @@ static_library("LLVMRISCVCodeGen") { "RISCVDeadRegisterDefinitions.cpp", "RISCVExpandAtomicPseudoInsts.cpp", "RISCVExpandPseudoInsts.cpp", + "RISCVFoldMemOffset.cpp", "RISCVFrameLowering.cpp", "RISCVGatherScatterLowering.cpp", "RISCVISelDAGToDAG.cpp", From c833746c6c062677a040d18d837c7fad71939171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Wed, 19 Feb 2025 21:24:49 +0100 Subject: [PATCH 118/220] [DSE] Make iter order deterministic in removePartiallyOverlappedStores. NFC (#127678) In removePartiallyOverlappedStores we iterate over InstOverlapIntervalsTy which is a DenseMap. Change that map into using MapVector to ensure that we apply the transforms in a deterministic order. I've only seen that the order matters if starting to use names for the instructions created when doing the transforms. But such things are a bit annoying when debugging etc. --- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a1649c276de83..f3b53e05c519e 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -174,7 +174,7 @@ static cl::opt EnableInitializesImprovement( // Helper functions //===----------------------------------------------------------------------===// using OverlapIntervalsTy = std::map; -using InstOverlapIntervalsTy = DenseMap; +using InstOverlapIntervalsTy = MapVector; /// Returns true if the end of this instruction can be safely shortened in /// length. From 1761066fc641be529ca45b3cfcf4788b8a7a688d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 19 Feb 2025 12:37:54 -0800 Subject: [PATCH 119/220] [GlobalOpt] Remove Function* argument from tryWidenGlobalArrayAndDests. NFC (#127848) This is only used to get the Module and the LLVMContext. We can get both of those from the GlobalVariable*. 
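
For reference, both are reachable directly from the global itself, which is what
the helpers are switched to below (illustrative sketch mirroring the calls in
the diff):

    GlobalVariable *OldVar = ...;             // the source global being widened
    Module *M = OldVar->getParent();          // GlobalValue::getParent()
    LLVMContext &Ctx = OldVar->getContext();  // Value::getContext()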
--- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 1a2a27d22ae68..2d046f09f1b2b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2064,7 +2064,7 @@ static bool destArrayCanBeWidened(CallInst *CI) { return true; } -static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F, +static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, unsigned NumBytesToPad, unsigned NumBytesToCopy) { if (!OldVar->hasInitializer()) @@ -2083,10 +2083,10 @@ static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F, StrData.push_back('\0'); auto Arr = ArrayRef(StrData.data(), NumBytesToCopy + NumBytesToPad); // Create new padded version of global variable. - Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr); + Constant *SourceReplace = ConstantDataArray::get(OldVar->getContext(), Arr); GlobalVariable *NewGV = new GlobalVariable( - *(F->getParent()), SourceReplace->getType(), true, OldVar->getLinkage(), - SourceReplace, SourceReplace->getName()); + *(OldVar->getParent()), SourceReplace->getType(), true, + OldVar->getLinkage(), SourceReplace, SourceReplace->getName()); // Copy any other attributes from original global variable // e.g. unamed_addr NewGV->copyAttributesFrom(OldVar); @@ -2114,13 +2114,13 @@ static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad, } } -static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar, +static bool tryWidenGlobalArrayAndDests(GlobalVariable *SourceVar, const unsigned NumBytesToPad, const unsigned NumBytesToCopy, ConstantInt *BytesToCopyOp, ConstantDataArray *SourceDataArray) { auto *NewSourceGV = - widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy); + widenGlobalVariable(SourceVar, NumBytesToPad, NumBytesToCopy); if (!NewSourceGV) return false; @@ -2158,8 +2158,6 @@ static bool tryWidenGlobalArraysUsedByMemcpy( if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI)) continue; - Function *F = CI->getCalledFunction(); - auto *BytesToCopyOp = dyn_cast(CI->getArgOperand(2)); if (!BytesToCopyOp) continue; @@ -2191,7 +2189,7 @@ static bool tryWidenGlobalArraysUsedByMemcpy( .getNumBytesToPadGlobalArray(NumBytesToCopy, SourceDataArray->getType()); if (NumBytesToPad) { - return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy, + return tryWidenGlobalArrayAndDests(GV, NumBytesToPad, NumBytesToCopy, BytesToCopyOp, SourceDataArray); } } From 1a6ed4d06e3c5b05e5ed9873888165c671292b06 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Thu, 20 Feb 2025 02:12:06 +0530 Subject: [PATCH 120/220] [mlir][Vector] Deprecate vector.extractelement/vector.insertelement (#113829) See https://discourse.llvm.org/t/rfc-psa-remove-vector-extractelement-and-vector-insertelement-ops-in-favor-of-vector-extract-and-vector-insert-ops/71116/6 for more information. 
--- mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index c821e7b1527b4..fbbf817ecff98 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -644,11 +644,13 @@ def Vector_ExtractElementOp : Results<(outs AnyType:$result)> { let summary = "extractelement operation"; let description = [{ + Note: This operation is deprecated. Please use vector.extract insert. + Takes a 0-D or 1-D vector and a optional dynamic index position and extracts the scalar at that position. Note that this instruction resembles vector.extract, but is restricted to - 0-D and 1-D vectors and relaxed to dynamic indices. + 0-D and 1-D vectors. If the vector is 0-D, the position must be std::nullopt. @@ -834,11 +836,13 @@ def Vector_InsertElementOp : Results<(outs AnyVectorOfAnyRank:$result)> { let summary = "insertelement operation"; let description = [{ + Note: This operation is deprecated. Please use vector.insert instead. + Takes a scalar source, a 0-D or 1-D destination vector and a dynamic index position and inserts the source into the destination at the proper position. Note that this instruction resembles vector.insert, but is restricted to 0-D - and 1-D vectors and relaxed to dynamic indices. + and 1-D vectors. It is meant to be closer to LLVM's version: https://llvm.org/docs/LangRef.html#insertelement-instruction From 92b07520bcba1134f60d368c3f0d9216ebbe76e5 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 19 Feb 2025 12:44:33 -0800 Subject: [PATCH 121/220] [MemProf] Support cloning through recursive cycles (#127429) In order to facilitate cloning of recursive cycles, we first identify backedges using a standard DFS search from the root callers, then initially defer recursively invoking the cloning function via those edges. This is because the cloning opportunity along the backedge may not be exposed until the current node is cloned for other non-backedge callers that are cold after the earlier recursive cloning, resulting in a cold predecessor of the backedge. So we recursively invoke the cloning function for the backedges during the cloning of the current node for its caller edges (which were sorted to enable handling cold callers first). There was no significant time or memory overhead measured for several large applications. 
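
Condensed, the backedge discovery added here is the standard DFS colouring
scheme; roughly (simplified from the patch, template parameters and asserts
omitted):

    // An edge U -> V is a backedge iff V is still on the current DFS stack.
    void markBackedges(ContextNode *Node,
                       DenseSet<const ContextNode *> &Visited,
                       DenseSet<const ContextNode *> &CurrentStack) {
      Visited.insert(Node);
      for (auto &CalleeEdge : Node->CalleeEdges) {
        auto *Callee = CalleeEdge->Callee;
        if (Visited.count(Callee)) {
          // Already visited: it is a backedge only if Callee is still on the
          // stack, i.e. we re-entered a node whose DFS has not finished yet.
          if (CurrentStack.count(Callee))
            CalleeEdge->IsBackedge = true;
          continue;
        }
        CurrentStack.insert(Callee);
        markBackedges(Callee, Visited, CurrentStack);
        CurrentStack.erase(Callee);
      }
    }

Cloning along edges marked this way is deferred until the callee has first been
cloned for its non-backedge callers, which is what exposes the cloning
opportunity through the cycle.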
--- .../IPO/MemProfContextDisambiguation.cpp | 233 +++++++++++++++--- llvm/test/ThinLTO/X86/memprof-recursive.ll | 21 +- .../MemProfContextDisambiguation/recursive.ll | 23 +- 3 files changed, 232 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index d748b162d7809..0982fd35401cb 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -89,6 +89,7 @@ STATISTIC(FoundProfiledCalleeMaxDepth, "Maximum depth of profiled callees found via tail calls"); STATISTIC(FoundProfiledCalleeNonUniquelyCount, "Number of profiled callees found via multiple tail call chains"); +STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning"); static cl::opt DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -127,6 +128,10 @@ static cl::opt AllowRecursiveCallsites( "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, cl::desc("Allow cloning of callsites involved in recursive cycles")); +static cl::opt CloneRecursiveContexts( + "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of contexts through recursive cycles")); + // When disabled, try to detect and prevent cloning of recursive contexts. // This is only necessary until we support cloning through recursive cycles. // Leave on by default for now, as disabling requires a little bit of compile @@ -134,7 +139,7 @@ static cl::opt AllowRecursiveCallsites( // hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled. static cl::opt AllowRecursiveContexts( "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, - cl::desc("Allow cloning of contexts through recursive cycles")); + cl::desc("Allow cloning of contexts having recursive cycles")); namespace llvm { cl::opt EnableMemProfContextDisambiguation( @@ -293,37 +298,40 @@ class CallsiteContextGraph { // TODO: Should this be a map (from Caller node) for more efficient lookup? std::vector> CallerEdges; - // Get the list of edges from which we can compute allocation information - // such as the context ids and allocation type of this node. - const std::vector> * - getEdgesWithAllocInfo() const { - // If node has any callees, compute from those, otherwise compute from - // callers (i.e. if this is the leaf allocation node). - if (!CalleeEdges.empty()) - return &CalleeEdges; + // Returns true if we need to look at the callee edges for determining the + // node context ids and allocation type. + bool useCallerEdgesForContextInfo() const { // Typically if the callee edges are empty either the caller edges are // also empty, or this is an allocation (leaf node). However, if we are // allowing recursive callsites and contexts this will be violated for // incompletely cloned recursive cycles. - assert(CallerEdges.empty() || IsAllocation || + assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation || (AllowRecursiveCallsites && AllowRecursiveContexts)); - if (!CallerEdges.empty() && IsAllocation) - return &CallerEdges; - return nullptr; + // When cloning for a recursive context, during cloning we might be in the + // midst of cloning for a recurrence and have moved context ids off of a + // caller edge onto the clone but not yet off of the incoming caller + // (back) edge. If we don't look at those we miss the fact that this node + // still has context ids of interest. 
+ return IsAllocation || CloneRecursiveContexts; } // Compute the context ids for this node from the union of its edge context // ids. DenseSet getContextIds() const { - DenseSet ContextIds; - auto *Edges = getEdgesWithAllocInfo(); - if (!Edges) - return {}; unsigned Count = 0; - for (auto &Edge : *Edges) + // Compute the number of ids for reserve below. In general we only need to + // look at one set of edges, typically the callee edges, since other than + // allocations and in some cases during recursion cloning, all the context + // ids on the callers should also flow out via callee edges. + for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges) Count += Edge->getContextIds().size(); + DenseSet ContextIds; ContextIds.reserve(Count); - for (auto &Edge : *Edges) + auto Edges = llvm::concat>( + CalleeEdges, useCallerEdgesForContextInfo() + ? CallerEdges + : std::vector>()); + for (const auto &Edge : Edges) ContextIds.insert(Edge->getContextIds().begin(), Edge->getContextIds().end()); return ContextIds; @@ -332,13 +340,14 @@ class CallsiteContextGraph { // Compute the allocation type for this node from the OR of its edge // allocation types. uint8_t computeAllocType() const { - auto *Edges = getEdgesWithAllocInfo(); - if (!Edges) - return (uint8_t)AllocationType::None; uint8_t BothTypes = (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; uint8_t AllocType = (uint8_t)AllocationType::None; - for (auto &Edge : *Edges) { + auto Edges = llvm::concat>( + CalleeEdges, useCallerEdgesForContextInfo() + ? CallerEdges + : std::vector>()); + for (const auto &Edge : Edges) { AllocType |= Edge->AllocTypes; // Bail early if alloc type reached both, no further refinement. if (AllocType == BothTypes) @@ -350,10 +359,11 @@ class CallsiteContextGraph { // The context ids set for this node is empty if its edge context ids are // also all empty. bool emptyContextIds() const { - auto *Edges = getEdgesWithAllocInfo(); - if (!Edges) - return true; - for (auto &Edge : *Edges) { + auto Edges = llvm::concat>( + CalleeEdges, useCallerEdgesForContextInfo() + ? CallerEdges + : std::vector>()); + for (const auto &Edge : Edges) { if (!Edge->getContextIds().empty()) return false; } @@ -434,6 +444,14 @@ class CallsiteContextGraph { // for contexts including this edge. uint8_t AllocTypes = 0; + // Set just before initiating cloning when cloning of recursive contexts is + // enabled. Used to defer cloning of backedges until we have done cloning of + // the callee node for non-backedge caller edges. This exposes cloning + // opportunities through the backedge of the cycle. + // TODO: Note that this is not updated during cloning, and it is unclear + // whether that would be needed. + bool IsBackedge = false; + // The set of IDs for contexts including this edge. DenseSet ContextIds; @@ -722,6 +740,9 @@ class CallsiteContextGraph { void moveCalleeEdgeToNewCaller(const std::shared_ptr &Edge, ContextNode *NewCaller); + void markBackedges(ContextNode *Node, DenseSet &Visited, + DenseSet &CurrentStack); + /// Recursively perform cloning on the graph for the given Node and its /// callers, in order to uniquely identify the allocation behavior of an /// allocation given its context. The context ids of the allocation being @@ -2874,6 +2895,7 @@ template void CallsiteContextGraph::ContextEdge::print( raw_ostream &OS) const { OS << "Edge from Callee " << Callee << " to Caller: " << Caller + << (IsBackedge ? 
" (BE)" : "") << " AllocTypes: " << getAllocTypeString(AllocTypes); OS << " ContextIds:"; std::vector SortedIds(ContextIds.begin(), ContextIds.end()); @@ -3115,6 +3137,8 @@ void CallsiteContextGraph:: // node (Edge's current callee may be the original node too). assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); + bool EdgeIsRecursive = Edge->Callee == Edge->Caller; + ContextNode *OldCallee = Edge->Callee; // We might already have an edge to the new callee from earlier cloning for a @@ -3181,8 +3205,16 @@ void CallsiteContextGraph:: // If this is a direct recursion edge, use NewCallee (the clone) as the // callee as well, so that any edge updated/created here is also direct // recursive. - if (CalleeToUse == OldCallee) + if (CalleeToUse == OldCallee) { + // If this is a recursive edge, see if we already moved a recursive edge + // (which would have to have been this one) - if we were only moving a + // subset of context ids it would still be on OldCallee. + if (EdgeIsRecursive) { + assert(OldCalleeEdge == Edge); + continue; + } CalleeToUse = NewCallee; + } // The context ids moving to the new callee are the subset of this edge's // context ids and the context ids on the caller edge being moved. DenseSet EdgeContextIdsToMove = @@ -3369,9 +3401,47 @@ void CallsiteContextGraph:: } } +// This is the standard DFS based backedge discovery algorithm. +template +void CallsiteContextGraph::markBackedges( + ContextNode *Node, DenseSet &Visited, + DenseSet &CurrentStack) { + auto I = Visited.insert(Node); + // We should only call this for unvisited nodes. + assert(I.second); + for (auto &CalleeEdge : Node->CalleeEdges) { + auto *Callee = CalleeEdge->Callee; + if (Visited.count(Callee)) { + // Since this was already visited we need to check if it is currently on + // the recursive stack in which case it is a backedge. + if (CurrentStack.count(Callee)) + CalleeEdge->IsBackedge = true; + continue; + } + CurrentStack.insert(Callee); + markBackedges(Callee, Visited, CurrentStack); + CurrentStack.erase(Callee); + } +} + template void CallsiteContextGraph::identifyClones() { + // If we are cloning recursive contexts, find and mark backedges from all root + // callers, using the typical DFS based backedge analysis. DenseSet Visited; + if (CloneRecursiveContexts) { + DenseSet CurrentStack; + for (auto &Entry : NonAllocationCallToContextNodeMap) { + auto *Node = Entry.second; + if (Node->isRemoved()) + continue; + // It is a root if it doesn't have callers. + if (!Node->CallerEdges.empty()) + continue; + markBackedges(Node, Visited, CurrentStack); + assert(CurrentStack.empty()); + } + } for (auto &Entry : AllocationCallToContextNodeMap) { Visited.clear(); identifyClones(Entry.second, Visited, Entry.second->getContextIds()); @@ -3430,6 +3500,14 @@ void CallsiteContextGraph::identifyClones( assert(!is_contained(Node->CallerEdges, Edge)); continue; } + // Defer backedges. See comments further below where these edges are + // handled during the cloning of this Node. + if (Edge->IsBackedge) { + // We should only mark these if cloning recursive contexts, where we + // need to do this deferral. + assert(CloneRecursiveContexts); + continue; + } // Ignore any caller we previously visited via another edge. 
if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { identifyClones(Edge->Caller, Visited, AllocContextIds); @@ -3483,6 +3561,7 @@ void CallsiteContextGraph::identifyClones( assert(Node->AllocTypes != (uint8_t)AllocationType::None); DenseSet RecursiveContextIds; + assert(AllowRecursiveContexts || !CloneRecursiveContexts); // If we are allowing recursive callsites, but have also disabled recursive // contexts, look for context ids that show up in multiple caller edges. if (AllowRecursiveCallsites && !AllowRecursiveContexts) { @@ -3505,6 +3584,13 @@ void CallsiteContextGraph::identifyClones( // makes it less error-prone. auto CallerEdges = Node->CallerEdges; for (auto &CallerEdge : CallerEdges) { + // Skip any that have been removed by an earlier recursive call. + if (CallerEdge->isRemoved()) { + assert(!is_contained(Node->CallerEdges, CallerEdge)); + continue; + } + assert(CallerEdge->Callee == Node); + // See if cloning the prior caller edge left this node with a single alloc // type or a single caller. In that case no more cloning of Node is needed. if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) @@ -3546,13 +3632,100 @@ void CallsiteContextGraph::identifyClones( // // Then check if by cloning node at least one of the callee edges will be // disambiguated by splitting out different context ids. + // + // However, always do the cloning if this is a backedge, in which case we + // have not yet cloned along this caller edge. assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); assert(Node->AllocTypes != (uint8_t)AllocationType::None); - if (allocTypeToUse(CallerAllocTypeForAlloc) == + if (!CallerEdge->IsBackedge && + allocTypeToUse(CallerAllocTypeForAlloc) == allocTypeToUse(Node->AllocTypes) && allocTypesMatch( - CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) + CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { continue; + } + + if (CallerEdge->IsBackedge) { + // We should only mark these if cloning recursive contexts, where we + // need to do this deferral. + assert(CloneRecursiveContexts); + DeferredBackedges++; + } + + // If this is a backedge, we now do recursive cloning starting from its + // caller since we may have moved unambiguous caller contexts to a clone + // of this Node in a previous iteration of the current loop, giving more + // opportunity for cloning through the backedge. Because we sorted the + // caller edges earlier so that cold caller edges are first, we would have + // visited and cloned this node for any unamibiguously cold non-recursive + // callers before any ambiguous backedge callers. Note that we don't do this + // if the caller is already cloned or visited during cloning (e.g. via a + // different context path from the allocation). + // TODO: Can we do better in the case where the caller was already visited? + if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf && + !Visited.count(CallerEdge->Caller)) { + const auto OrigIdCount = CallerEdge->getContextIds().size(); + // Now do the recursive cloning of this backedge's caller, which was + // deferred earlier. + identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc); + removeNoneTypeCalleeEdges(CallerEdge->Caller); + // See if the recursive call to identifyClones moved the context ids to a + // new edge from this node to a clone of caller, and switch to looking at + // that new edge so that we clone Node for the new caller clone. 
+ bool UpdatedEdge = false; + if (OrigIdCount > CallerEdge->getContextIds().size()) { + for (auto E : Node->CallerEdges) { + // Only interested in clones of the current edges caller. + if (E->Caller->CloneOf != CallerEdge->Caller) + continue; + // See if this edge contains any of the context ids originally on the + // current caller edge. + auto CallerEdgeContextsForAllocNew = + set_intersection(CallerEdgeContextsForAlloc, E->getContextIds()); + if (CallerEdgeContextsForAllocNew.empty()) + continue; + // Make sure we don't pick a previously existing caller edge of this + // Node, which would be processed on a different iteration of the + // outer loop over the saved CallerEdges. + if (std::find(CallerEdges.begin(), CallerEdges.end(), E) != + CallerEdges.end()) + continue; + // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge + // are updated further below for all cases where we just invoked + // identifyClones recursively. + CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew); + CallerEdge = E; + UpdatedEdge = true; + break; + } + } + // If cloning removed this edge (and we didn't update it to a new edge + // above), we're done with this edge. It's possible we moved all of the + // context ids to an existing clone, in which case there's no need to do + // further processing for them. + if (CallerEdge->isRemoved()) + continue; + + // Now we need to update the information used for the cloning decisions + // further below, as we may have modified edges and their context ids. + + // Note if we changed the CallerEdge above we would have already updated + // the context ids. + if (!UpdatedEdge) { + CallerEdgeContextsForAlloc = set_intersection( + CallerEdgeContextsForAlloc, CallerEdge->getContextIds()); + if (CallerEdgeContextsForAlloc.empty()) + continue; + } + // Update the other information that depends on the edges and on the now + // updated CallerEdgeContextsForAlloc. + CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc); + CalleeEdgeAllocTypesForCallerEdge.clear(); + for (auto &CalleeEdge : Node->CalleeEdges) { + CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes( + CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc)); + } + } // First see if we can use an existing clone. Check each clone and its // callee edges for matching alloc types. diff --git a/llvm/test/ThinLTO/X86/memprof-recursive.ll b/llvm/test/ThinLTO/X86/memprof-recursive.ll index 4b2b5490bc2cb..e1a9084b583b9 100644 --- a/llvm/test/ThinLTO/X86/memprof-recursive.ll +++ b/llvm/test/ThinLTO/X86/memprof-recursive.ll @@ -3,12 +3,15 @@ ;; See llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll for ;; information on how the test was created. +;; -stats requires asserts +; REQUIRES: asserts + ; RUN: opt -thinlto-bc %s >%t.o ;; Check behavior when we enable cloning of contexts involved with recursive -;; cycles, but not through the cycle itself. I.e. until full support for -;; recursion is added, the cloned recursive call from C back to B (line 12) will -;; not be updated to call a clone. +;; cycles, but not through the cycle itself. I.e. with full support for cloning +;; recursive cycles off, the cloned recursive call from C back to B (line 12) +;; will not be updated to call a clone. 
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \ ; RUN: -r=%t.o,_Z1Dv,plx \ @@ -19,6 +22,7 @@ ; RUN: -memprof-verify-ccg -memprof-verify-nodes \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: -o %t.out 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS @@ -39,7 +43,7 @@ ; RUN: --implicit-check-not="created clone" \ ; RUN: --implicit-check-not="marked with memprof allocation attribute cold" -;; Check the default behavior (enabled recursive callsites). +;; Check the default behavior (clone recursive callsites). ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \ ; RUN: -r=%t.o,_Z1Dv,plx \ @@ -47,11 +51,11 @@ ; RUN: -r=%t.o,_Z1Bi,plx \ ; RUN: -r=%t.o,main,plx \ ; RUN: -r=%t.o,_Znam, \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -o %t.out 2>&1 | FileCheck %s \ -; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ -; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS +; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS \ +; RUN: --check-prefix=CLONE-RECUR-CALLSITES ;; Skipping recursive contexts should prevent spurious call to cloned version of ;; B from the context starting at memprof_recursive.cc:19:13, which is actually @@ -67,6 +71,7 @@ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ ; RUN: -memprof-allow-recursive-contexts=false \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: -o %t.out 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS @@ -76,6 +81,7 @@ ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1 +; CLONE-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1 ;; We should only call the cold clone for the recursive context if we enabled @@ -83,6 +89,7 @@ ; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 ; SKIP-RECUR-CONTEXTS-NOT: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 +; CLONE-RECUR-CALLSITES: 1 memprof-context-disambiguation - Number of backedges with deferred cloning target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = 
"x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll index d234dedc5a57a..1d09b1c1a0cb3 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll @@ -1,5 +1,8 @@ ;; Test recursion handling during cloning. -;; + +;; -stats requires asserts +; REQUIRES: asserts + ;; Original code looks like: ;; ;; #include @@ -35,13 +38,14 @@ ;; The IR was then reduced using llvm-reduce with the expected FileCheck input. ;; Check behavior when we enable cloning of contexts involved with recursive -;; cycles, but not through the cycle itself. I.e. until full support for -;; recursion is added, the cloned recursive call from C back to B (line 12) will -;; not be updated to call a clone. +;; cycles, but not through the cycle itself. I.e. with full support for cloning +;; recursive cycles off, the cloned recursive call from C back to B (line 12) +;; will not be updated to call a clone. ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: %s -S 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS @@ -57,13 +61,13 @@ ; RUN: --implicit-check-not="marked with memprof allocation attribute cold" \ ; RUN: --check-prefix=ALL -;; Check the default behavior (enabled recursive callsites). +;; Check the default behavior (clone recursive callsites). ; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ -; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: %s -S 2>&1 | FileCheck %s \ -; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ -; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS +; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS \ +; RUN: --check-prefix=CLONE-RECUR-CALLSITES ;; Skipping recursive contexts should prevent spurious call to cloned version of ;; B from the context starting at memprof_recursive.cc:19:13, which is actually @@ -73,6 +77,7 @@ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=true \ ; RUN: -memprof-allow-recursive-contexts=false \ +; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: %s -S 2>&1 | FileCheck %s \ ; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ ; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS @@ -84,6 +89,7 @@ ;; We should only call the cold clone for the recursive context if we enabled ;; recursive contexts via -memprof-allow-recursive-contexts=true (default). 
; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1 +; CLONE-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1 ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold @@ -95,6 +101,7 @@ ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi assigned to call function clone _Z1Ci ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci assigned to call function clone _Z1Dv ; ALL: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold +; CLONE-RECUR-CALLSITES: 1 memprof-context-disambiguation - Number of backedges with deferred cloning target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" From 3836559e65b002579e2c6697969557ffbbb8cc7d Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Wed, 19 Feb 2025 21:36:00 +0100 Subject: [PATCH 122/220] [InstCombine] Test for regession with trunc in foldSelectICmpAnd --- llvm/test/Transforms/InstCombine/select-icmp-and.ll | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/select-icmp-and.ll b/llvm/test/Transforms/InstCombine/select-icmp-and.ll index e49c2f6214114..16fb3f34047ee 100644 --- a/llvm/test/Transforms/InstCombine/select-icmp-and.ll +++ b/llvm/test/Transforms/InstCombine/select-icmp-and.ll @@ -900,3 +900,15 @@ define i8 @neg_select_trunc_bittest_to_shl_extra_use(i8 %x) { %ret = select i1 %trunc, i8 4, i8 0 ret i8 %ret } + +define i16 @select_trunc_nuw_bittest_or(i8 %x) { +; CHECK-LABEL: @select_trunc_nuw_bittest_or( +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[TMP1]], i16 20, i16 4 +; CHECK-NEXT: ret i16 [[RES]] +; + %trunc = trunc nuw i8 %x to i1 + %select = select i1 %trunc, i16 16, i16 0 + %res = or i16 4, %select + ret i16 %res +} From c0c42c8b3213520700f15587ab8aa4477a286ff9 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 19 Feb 2025 13:08:43 -0800 Subject: [PATCH 123/220] [SandboxIR][NFC] Change order of ifs in Context::getOrCreateValueInternal() (#127891) Move the most common if statement to the top and the least common ones to the bottom. This should save CPU cycles during compilation. This patch also prefixes the llvm variables with the LLVM prefix to make the naming convention in this function more uniform. For example `C` to `LLVMC`. 
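For context, a minimal self-contained sketch (a toy classifier, not the SandboxIR factory itself; the function and string values below are illustrative only) of the dispatch order this change establishes — the most frequent kind is tested first, so typical lookups take the shortest path:

  #include "llvm/IR/Argument.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Casting.h"

  // Toy classifier mirroring the new ordering in getOrCreateValueInternal():
  // Instruction first (the most common case), then Constant, then the rarer
  // kinds. An Instruction is never a Constant, so reordering the checks only
  // changes how quickly the common case is reached, not the result.
  static const char *classify(const llvm::Value *LLVMV) {
    if (llvm::isa<llvm::Instruction>(LLVMV))
      return "instruction"; // hottest path, checked first
    if (llvm::isa<llvm::Constant>(LLVMV))
      return "constant";
    if (llvm::isa<llvm::Argument>(LLVMV))
      return "argument";
    return "other"; // basic blocks, metadata-as-value, inline asm, ...
  }

The set of handled kinds and the SandboxIR objects created for them are unchanged; only the order of the type checks on the common path differs.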
--- llvm/lib/SandboxIR/Context.cpp | 543 +++++++++++++++++---------------- 1 file changed, 276 insertions(+), 267 deletions(-) diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index 38ca60cfac3ec..21039ce7ed834 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -59,26 +59,264 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { if (!Pair.second) return It->second.get(); - if (auto *C = dyn_cast(LLVMV)) { - switch (C->getValueID()) { + // Instruction + if (auto *LLVMI = dyn_cast(LLVMV)) { + switch (LLVMI->getOpcode()) { + case llvm::Instruction::VAArg: { + auto *LLVMVAArg = cast(LLVMV); + It->second = std::unique_ptr(new VAArgInst(LLVMVAArg, *this)); + return It->second.get(); + } + case llvm::Instruction::Freeze: { + auto *LLVMFreeze = cast(LLVMV); + It->second = + std::unique_ptr(new FreezeInst(LLVMFreeze, *this)); + return It->second.get(); + } + case llvm::Instruction::Fence: { + auto *LLVMFence = cast(LLVMV); + It->second = std::unique_ptr(new FenceInst(LLVMFence, *this)); + return It->second.get(); + } + case llvm::Instruction::Select: { + auto *LLVMSel = cast(LLVMV); + It->second = std::unique_ptr(new SelectInst(LLVMSel, *this)); + return It->second.get(); + } + case llvm::Instruction::ExtractElement: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new ExtractElementInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::InsertElement: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new InsertElementInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::ShuffleVector: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new ShuffleVectorInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::ExtractValue: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new ExtractValueInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::InsertValue: { + auto *LLVMIns = cast(LLVMV); + It->second = + std::unique_ptr(new InsertValueInst(LLVMIns, *this)); + return It->second.get(); + } + case llvm::Instruction::Br: { + auto *LLVMBr = cast(LLVMV); + It->second = std::unique_ptr(new BranchInst(LLVMBr, *this)); + return It->second.get(); + } + case llvm::Instruction::Load: { + auto *LLVMLd = cast(LLVMV); + It->second = std::unique_ptr(new LoadInst(LLVMLd, *this)); + return It->second.get(); + } + case llvm::Instruction::Store: { + auto *LLVMSt = cast(LLVMV); + It->second = std::unique_ptr(new StoreInst(LLVMSt, *this)); + return It->second.get(); + } + case llvm::Instruction::Ret: { + auto *LLVMRet = cast(LLVMV); + It->second = std::unique_ptr(new ReturnInst(LLVMRet, *this)); + return It->second.get(); + } + case llvm::Instruction::Call: { + auto *LLVMCall = cast(LLVMV); + It->second = std::unique_ptr(new CallInst(LLVMCall, *this)); + return It->second.get(); + } + case llvm::Instruction::Invoke: { + auto *LLVMInvoke = cast(LLVMV); + It->second = + std::unique_ptr(new InvokeInst(LLVMInvoke, *this)); + return It->second.get(); + } + case llvm::Instruction::CallBr: { + auto *LLVMCallBr = cast(LLVMV); + It->second = + std::unique_ptr(new CallBrInst(LLVMCallBr, *this)); + return It->second.get(); + } + case llvm::Instruction::LandingPad: { + auto *LLVMLPad = cast(LLVMV); + It->second = + std::unique_ptr(new LandingPadInst(LLVMLPad, *this)); + return It->second.get(); + } + case llvm::Instruction::CatchPad: { + auto *LLVMCPI = cast(LLVMV); + It->second = + 
std::unique_ptr(new CatchPadInst(LLVMCPI, *this)); + return It->second.get(); + } + case llvm::Instruction::CleanupPad: { + auto *LLVMCPI = cast(LLVMV); + It->second = + std::unique_ptr(new CleanupPadInst(LLVMCPI, *this)); + return It->second.get(); + } + case llvm::Instruction::CatchRet: { + auto *LLVMCRI = cast(LLVMV); + It->second = + std::unique_ptr(new CatchReturnInst(LLVMCRI, *this)); + return It->second.get(); + } + case llvm::Instruction::CleanupRet: { + auto *LLVMCRI = cast(LLVMV); + It->second = std::unique_ptr( + new CleanupReturnInst(LLVMCRI, *this)); + return It->second.get(); + } + case llvm::Instruction::GetElementPtr: { + auto *LLVMGEP = cast(LLVMV); + It->second = std::unique_ptr( + new GetElementPtrInst(LLVMGEP, *this)); + return It->second.get(); + } + case llvm::Instruction::CatchSwitch: { + auto *LLVMCatchSwitchInst = cast(LLVMV); + It->second = std::unique_ptr( + new CatchSwitchInst(LLVMCatchSwitchInst, *this)); + return It->second.get(); + } + case llvm::Instruction::Resume: { + auto *LLVMResumeInst = cast(LLVMV); + It->second = + std::unique_ptr(new ResumeInst(LLVMResumeInst, *this)); + return It->second.get(); + } + case llvm::Instruction::Switch: { + auto *LLVMSwitchInst = cast(LLVMV); + It->second = + std::unique_ptr(new SwitchInst(LLVMSwitchInst, *this)); + return It->second.get(); + } + case llvm::Instruction::FNeg: { + auto *LLVMUnaryOperator = cast(LLVMV); + It->second = std::unique_ptr( + new UnaryOperator(LLVMUnaryOperator, *this)); + return It->second.get(); + } + case llvm::Instruction::Add: + case llvm::Instruction::FAdd: + case llvm::Instruction::Sub: + case llvm::Instruction::FSub: + case llvm::Instruction::Mul: + case llvm::Instruction::FMul: + case llvm::Instruction::UDiv: + case llvm::Instruction::SDiv: + case llvm::Instruction::FDiv: + case llvm::Instruction::URem: + case llvm::Instruction::SRem: + case llvm::Instruction::FRem: + case llvm::Instruction::Shl: + case llvm::Instruction::LShr: + case llvm::Instruction::AShr: + case llvm::Instruction::And: + case llvm::Instruction::Or: + case llvm::Instruction::Xor: { + auto *LLVMBinaryOperator = cast(LLVMV); + It->second = std::unique_ptr( + new BinaryOperator(LLVMBinaryOperator, *this)); + return It->second.get(); + } + case llvm::Instruction::AtomicRMW: { + auto *LLVMAtomicRMW = cast(LLVMV); + It->second = std::unique_ptr( + new AtomicRMWInst(LLVMAtomicRMW, *this)); + return It->second.get(); + } + case llvm::Instruction::AtomicCmpXchg: { + auto *LLVMAtomicCmpXchg = cast(LLVMV); + It->second = std::unique_ptr( + new AtomicCmpXchgInst(LLVMAtomicCmpXchg, *this)); + return It->second.get(); + } + case llvm::Instruction::Alloca: { + auto *LLVMAlloca = cast(LLVMV); + It->second = + std::unique_ptr(new AllocaInst(LLVMAlloca, *this)); + return It->second.get(); + } + case llvm::Instruction::ZExt: + case llvm::Instruction::SExt: + case llvm::Instruction::FPToUI: + case llvm::Instruction::FPToSI: + case llvm::Instruction::FPExt: + case llvm::Instruction::PtrToInt: + case llvm::Instruction::IntToPtr: + case llvm::Instruction::SIToFP: + case llvm::Instruction::UIToFP: + case llvm::Instruction::Trunc: + case llvm::Instruction::FPTrunc: + case llvm::Instruction::BitCast: + case llvm::Instruction::AddrSpaceCast: { + auto *LLVMCast = cast(LLVMV); + It->second = std::unique_ptr(new CastInst(LLVMCast, *this)); + return It->second.get(); + } + case llvm::Instruction::PHI: { + auto *LLVMPhi = cast(LLVMV); + It->second = std::unique_ptr(new PHINode(LLVMPhi, *this)); + return It->second.get(); + } + case 
llvm::Instruction::ICmp: { + auto *LLVMICmp = cast(LLVMV); + It->second = std::unique_ptr(new ICmpInst(LLVMICmp, *this)); + return It->second.get(); + } + case llvm::Instruction::FCmp: { + auto *LLVMFCmp = cast(LLVMV); + It->second = std::unique_ptr(new FCmpInst(LLVMFCmp, *this)); + return It->second.get(); + } + case llvm::Instruction::Unreachable: { + auto *LLVMUnreachable = cast(LLVMV); + It->second = std::unique_ptr( + new UnreachableInst(LLVMUnreachable, *this)); + return It->second.get(); + } + default: + break; + } + It->second = std::unique_ptr( + new OpaqueInst(cast(LLVMV), *this)); + return It->second.get(); + } + // Constant + if (auto *LLVMC = dyn_cast(LLVMV)) { + switch (LLVMC->getValueID()) { case llvm::Value::ConstantIntVal: It->second = std::unique_ptr( - new ConstantInt(cast(C), *this)); + new ConstantInt(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::ConstantFPVal: It->second = std::unique_ptr( - new ConstantFP(cast(C), *this)); + new ConstantFP(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::BlockAddressVal: It->second = std::unique_ptr( - new BlockAddress(cast(C), *this)); + new BlockAddress(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::ConstantTokenNoneVal: It->second = std::unique_ptr( - new ConstantTokenNone(cast(C), *this)); + new ConstantTokenNone(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::ConstantAggregateZeroVal: { - auto *CAZ = cast(C); + auto *CAZ = cast(LLVMC); It->second = std::unique_ptr( new ConstantAggregateZero(CAZ, *this)); auto *Ret = It->second.get(); @@ -91,19 +329,19 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { return Ret; } case llvm::Value::ConstantPointerNullVal: - It->second = std::unique_ptr( - new ConstantPointerNull(cast(C), *this)); + It->second = std::unique_ptr(new ConstantPointerNull( + cast(LLVMC), *this)); return It->second.get(); case llvm::Value::PoisonValueVal: It->second = std::unique_ptr( - new PoisonValue(cast(C), *this)); + new PoisonValue(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::UndefValueVal: It->second = std::unique_ptr( - new UndefValue(cast(C), *this)); + new UndefValue(cast(LLVMC), *this)); return It->second.get(); case llvm::Value::DSOLocalEquivalentVal: { - auto *DSOLE = cast(C); + auto *DSOLE = cast(LLVMC); It->second = std::unique_ptr( new DSOLocalEquivalent(DSOLE, *this)); auto *Ret = It->second.get(); @@ -112,306 +350,77 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { } case llvm::Value::ConstantArrayVal: It->second = std::unique_ptr( - new ConstantArray(cast(C), *this)); + new ConstantArray(cast(LLVMC), *this)); break; case llvm::Value::ConstantStructVal: It->second = std::unique_ptr( - new ConstantStruct(cast(C), *this)); + new ConstantStruct(cast(LLVMC), *this)); break; case llvm::Value::ConstantVectorVal: It->second = std::unique_ptr( - new ConstantVector(cast(C), *this)); + new ConstantVector(cast(LLVMC), *this)); break; case llvm::Value::FunctionVal: It->second = std::unique_ptr( - new Function(cast(C), *this)); + new Function(cast(LLVMC), *this)); break; case llvm::Value::GlobalIFuncVal: It->second = std::unique_ptr( - new GlobalIFunc(cast(C), *this)); + new GlobalIFunc(cast(LLVMC), *this)); break; case llvm::Value::GlobalVariableVal: It->second = std::unique_ptr( - new GlobalVariable(cast(C), *this)); + new GlobalVariable(cast(LLVMC), *this)); break; case llvm::Value::GlobalAliasVal: It->second = std::unique_ptr( - new GlobalAlias(cast(C), 
*this)); + new GlobalAlias(cast(LLVMC), *this)); break; case llvm::Value::NoCFIValueVal: It->second = std::unique_ptr( - new NoCFIValue(cast(C), *this)); + new NoCFIValue(cast(LLVMC), *this)); break; case llvm::Value::ConstantPtrAuthVal: It->second = std::unique_ptr( - new ConstantPtrAuth(cast(C), *this)); + new ConstantPtrAuth(cast(LLVMC), *this)); break; case llvm::Value::ConstantExprVal: It->second = std::unique_ptr( - new ConstantExpr(cast(C), *this)); + new ConstantExpr(cast(LLVMC), *this)); break; default: - It->second = std::unique_ptr(new Constant(C, *this)); + It->second = std::unique_ptr(new Constant(LLVMC, *this)); break; } auto *NewC = It->second.get(); - for (llvm::Value *COp : C->operands()) - getOrCreateValueInternal(COp, C); + for (llvm::Value *COp : LLVMC->operands()) + getOrCreateValueInternal(COp, LLVMC); return NewC; } - if (auto *Arg = dyn_cast(LLVMV)) { - It->second = std::unique_ptr(new Argument(Arg, *this)); + // Argument + if (auto *LLVMArg = dyn_cast(LLVMV)) { + It->second = std::unique_ptr(new Argument(LLVMArg, *this)); return It->second.get(); } - if (auto *BB = dyn_cast(LLVMV)) { + // BasicBlock + if (auto *LLVMBB = dyn_cast(LLVMV)) { assert(isa(U) && "This won't create a SBBB, don't call this function directly!"); - if (auto *SBBB = getValue(BB)) + if (auto *SBBB = getValue(LLVMBB)) return SBBB; return nullptr; } - // TODO: Move these checks after more common Values, like after Instruction. - if (auto *MD = dyn_cast(LLVMV)) { - It->second = std::unique_ptr(new OpaqueValue(MD, *this)); - return It->second.get(); - } - if (auto *Asm = dyn_cast(LLVMV)) { - It->second = std::unique_ptr(new OpaqueValue(Asm, *this)); - return It->second.get(); - } - assert(isa(LLVMV) && "Expected Instruction"); - - switch (cast(LLVMV)->getOpcode()) { - case llvm::Instruction::VAArg: { - auto *LLVMVAArg = cast(LLVMV); - It->second = std::unique_ptr(new VAArgInst(LLVMVAArg, *this)); - return It->second.get(); - } - case llvm::Instruction::Freeze: { - auto *LLVMFreeze = cast(LLVMV); - It->second = std::unique_ptr(new FreezeInst(LLVMFreeze, *this)); - return It->second.get(); - } - case llvm::Instruction::Fence: { - auto *LLVMFence = cast(LLVMV); - It->second = std::unique_ptr(new FenceInst(LLVMFence, *this)); - return It->second.get(); - } - case llvm::Instruction::Select: { - auto *LLVMSel = cast(LLVMV); - It->second = std::unique_ptr(new SelectInst(LLVMSel, *this)); - return It->second.get(); - } - case llvm::Instruction::ExtractElement: { - auto *LLVMIns = cast(LLVMV); - It->second = std::unique_ptr( - new ExtractElementInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::InsertElement: { - auto *LLVMIns = cast(LLVMV); - It->second = std::unique_ptr( - new InsertElementInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::ShuffleVector: { - auto *LLVMIns = cast(LLVMV); - It->second = std::unique_ptr( - new ShuffleVectorInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::ExtractValue: { - auto *LLVMIns = cast(LLVMV); - It->second = - std::unique_ptr(new ExtractValueInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::InsertValue: { - auto *LLVMIns = cast(LLVMV); - It->second = - std::unique_ptr(new InsertValueInst(LLVMIns, *this)); - return It->second.get(); - } - case llvm::Instruction::Br: { - auto *LLVMBr = cast(LLVMV); - It->second = std::unique_ptr(new BranchInst(LLVMBr, *this)); - return It->second.get(); - } - case llvm::Instruction::Load: { - auto *LLVMLd = cast(LLVMV); 
- It->second = std::unique_ptr(new LoadInst(LLVMLd, *this)); - return It->second.get(); - } - case llvm::Instruction::Store: { - auto *LLVMSt = cast(LLVMV); - It->second = std::unique_ptr(new StoreInst(LLVMSt, *this)); - return It->second.get(); - } - case llvm::Instruction::Ret: { - auto *LLVMRet = cast(LLVMV); - It->second = std::unique_ptr(new ReturnInst(LLVMRet, *this)); - return It->second.get(); - } - case llvm::Instruction::Call: { - auto *LLVMCall = cast(LLVMV); - It->second = std::unique_ptr(new CallInst(LLVMCall, *this)); - return It->second.get(); - } - case llvm::Instruction::Invoke: { - auto *LLVMInvoke = cast(LLVMV); - It->second = std::unique_ptr(new InvokeInst(LLVMInvoke, *this)); - return It->second.get(); - } - case llvm::Instruction::CallBr: { - auto *LLVMCallBr = cast(LLVMV); - It->second = std::unique_ptr(new CallBrInst(LLVMCallBr, *this)); - return It->second.get(); - } - case llvm::Instruction::LandingPad: { - auto *LLVMLPad = cast(LLVMV); - It->second = - std::unique_ptr(new LandingPadInst(LLVMLPad, *this)); + // Metadata + if (auto *LLVMMD = dyn_cast(LLVMV)) { + It->second = std::unique_ptr(new OpaqueValue(LLVMMD, *this)); return It->second.get(); } - case llvm::Instruction::CatchPad: { - auto *LLVMCPI = cast(LLVMV); - It->second = - std::unique_ptr(new CatchPadInst(LLVMCPI, *this)); + // InlineAsm + if (auto *LLVMAsm = dyn_cast(LLVMV)) { + It->second = std::unique_ptr(new OpaqueValue(LLVMAsm, *this)); return It->second.get(); } - case llvm::Instruction::CleanupPad: { - auto *LLVMCPI = cast(LLVMV); - It->second = - std::unique_ptr(new CleanupPadInst(LLVMCPI, *this)); - return It->second.get(); - } - case llvm::Instruction::CatchRet: { - auto *LLVMCRI = cast(LLVMV); - It->second = - std::unique_ptr(new CatchReturnInst(LLVMCRI, *this)); - return It->second.get(); - } - case llvm::Instruction::CleanupRet: { - auto *LLVMCRI = cast(LLVMV); - It->second = std::unique_ptr( - new CleanupReturnInst(LLVMCRI, *this)); - return It->second.get(); - } - case llvm::Instruction::GetElementPtr: { - auto *LLVMGEP = cast(LLVMV); - It->second = std::unique_ptr( - new GetElementPtrInst(LLVMGEP, *this)); - return It->second.get(); - } - case llvm::Instruction::CatchSwitch: { - auto *LLVMCatchSwitchInst = cast(LLVMV); - It->second = std::unique_ptr( - new CatchSwitchInst(LLVMCatchSwitchInst, *this)); - return It->second.get(); - } - case llvm::Instruction::Resume: { - auto *LLVMResumeInst = cast(LLVMV); - It->second = - std::unique_ptr(new ResumeInst(LLVMResumeInst, *this)); - return It->second.get(); - } - case llvm::Instruction::Switch: { - auto *LLVMSwitchInst = cast(LLVMV); - It->second = - std::unique_ptr(new SwitchInst(LLVMSwitchInst, *this)); - return It->second.get(); - } - case llvm::Instruction::FNeg: { - auto *LLVMUnaryOperator = cast(LLVMV); - It->second = std::unique_ptr( - new UnaryOperator(LLVMUnaryOperator, *this)); - return It->second.get(); - } - case llvm::Instruction::Add: - case llvm::Instruction::FAdd: - case llvm::Instruction::Sub: - case llvm::Instruction::FSub: - case llvm::Instruction::Mul: - case llvm::Instruction::FMul: - case llvm::Instruction::UDiv: - case llvm::Instruction::SDiv: - case llvm::Instruction::FDiv: - case llvm::Instruction::URem: - case llvm::Instruction::SRem: - case llvm::Instruction::FRem: - case llvm::Instruction::Shl: - case llvm::Instruction::LShr: - case llvm::Instruction::AShr: - case llvm::Instruction::And: - case llvm::Instruction::Or: - case llvm::Instruction::Xor: { - auto *LLVMBinaryOperator = cast(LLVMV); - It->second = 
std::unique_ptr( - new BinaryOperator(LLVMBinaryOperator, *this)); - return It->second.get(); - } - case llvm::Instruction::AtomicRMW: { - auto *LLVMAtomicRMW = cast(LLVMV); - It->second = - std::unique_ptr(new AtomicRMWInst(LLVMAtomicRMW, *this)); - return It->second.get(); - } - case llvm::Instruction::AtomicCmpXchg: { - auto *LLVMAtomicCmpXchg = cast(LLVMV); - It->second = std::unique_ptr( - new AtomicCmpXchgInst(LLVMAtomicCmpXchg, *this)); - return It->second.get(); - } - case llvm::Instruction::Alloca: { - auto *LLVMAlloca = cast(LLVMV); - It->second = std::unique_ptr(new AllocaInst(LLVMAlloca, *this)); - return It->second.get(); - } - case llvm::Instruction::ZExt: - case llvm::Instruction::SExt: - case llvm::Instruction::FPToUI: - case llvm::Instruction::FPToSI: - case llvm::Instruction::FPExt: - case llvm::Instruction::PtrToInt: - case llvm::Instruction::IntToPtr: - case llvm::Instruction::SIToFP: - case llvm::Instruction::UIToFP: - case llvm::Instruction::Trunc: - case llvm::Instruction::FPTrunc: - case llvm::Instruction::BitCast: - case llvm::Instruction::AddrSpaceCast: { - auto *LLVMCast = cast(LLVMV); - It->second = std::unique_ptr(new CastInst(LLVMCast, *this)); - return It->second.get(); - } - case llvm::Instruction::PHI: { - auto *LLVMPhi = cast(LLVMV); - It->second = std::unique_ptr(new PHINode(LLVMPhi, *this)); - return It->second.get(); - } - case llvm::Instruction::ICmp: { - auto *LLVMICmp = cast(LLVMV); - It->second = std::unique_ptr(new ICmpInst(LLVMICmp, *this)); - return It->second.get(); - } - case llvm::Instruction::FCmp: { - auto *LLVMFCmp = cast(LLVMV); - It->second = std::unique_ptr(new FCmpInst(LLVMFCmp, *this)); - return It->second.get(); - } - case llvm::Instruction::Unreachable: { - auto *LLVMUnreachable = cast(LLVMV); - It->second = std::unique_ptr( - new UnreachableInst(LLVMUnreachable, *this)); - return It->second.get(); - } - default: - break; - } - - It->second = std::unique_ptr( - new OpaqueInst(cast(LLVMV), *this)); - return It->second.get(); + llvm_unreachable("Unhandled LLVMV type!"); } Argument *Context::getOrCreateArgument(llvm::Argument *LLVMArg) { From 5f8b2568219d5e516928aed67f13b59de8ccee17 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Wed, 19 Feb 2025 13:24:12 -0800 Subject: [PATCH 124/220] Check the type of Objective-C++ instance variables in WebKit member variable checkers. (#127570) Like a C++ member variable, every Objective-C++ instance variable must be a RefPtr, Ref CheckedPtr, or CheckedRef to an object, not a raw pointer or reference. 
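As a quick illustration of the invariant being enforced (a self-contained sketch with stand-in types; the real tests include mock-types.h and WebKit's actual smart pointers):

  // Stand-ins used only for this sketch; not the real WebKit classes.
  struct RefCountable { void ref(); void deref(); };
  template <typename T> struct RefPtr { T *m_ptr = nullptr; };

  class Holder {
    RefCountable *m_raw;          // raw pointer member: the checker reports this
    RefPtr<RefCountable> m_safe;  // smart-pointer member: accepted
  };

With this change the same rule is applied to Objective-C++ instance variables, and the diagnostic reads "Instance variable ..." rather than "Member variable ..." when the offending declaration is an ivar.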
--- .../WebKit/RawPtrRefMemberChecker.cpp | 40 +++++++++++++++++-- .../Checkers/WebKit/unchecked-members-objc.mm | 35 ++++++++++++++++ .../Checkers/WebKit/uncounted-members-objc.mm | 35 ++++++++++++++++ 3 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 clang/test/Analysis/Checkers/WebKit/unchecked-members-objc.mm create mode 100644 clang/test/Analysis/Checkers/WebKit/uncounted-members-objc.mm diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp index 79f88553feb95..963f59831c8ed 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp @@ -61,6 +61,11 @@ class RawPtrRefMemberChecker Checker->visitRecordDecl(RD); return true; } + + bool VisitObjCContainerDecl(const ObjCContainerDecl *CD) override { + Checker->visitObjCDecl(CD); + return true; + } }; LocalVisitor visitor(this); @@ -87,6 +92,31 @@ class RawPtrRefMemberChecker } } + void visitObjCDecl(const ObjCContainerDecl *CD) const { + if (auto *ID = dyn_cast(CD)) { + for (auto *Ivar : ID->ivars()) + visitIvarDecl(CD, Ivar); + return; + } + if (auto *ID = dyn_cast(CD)) { + for (auto *Ivar : ID->ivars()) + visitIvarDecl(CD, Ivar); + return; + } + } + + void visitIvarDecl(const ObjCContainerDecl *CD, + const ObjCIvarDecl *Ivar) const { + const Type *IvarType = Ivar->getType().getTypePtrOrNull(); + if (!IvarType) + return; + if (auto *IvarCXXRD = IvarType->getPointeeCXXRecordDecl()) { + std::optional IsCompatible = isPtrCompatible(IvarCXXRD); + if (IsCompatible && *IsCompatible) + reportBug(Ivar, IvarType, IvarCXXRD, CD); + } + } + bool shouldSkipDecl(const RecordDecl *RD) const { if (!RD->isThisDeclarationADefinition()) return true; @@ -121,9 +151,10 @@ class RawPtrRefMemberChecker return false; } - void reportBug(const FieldDecl *Member, const Type *MemberType, + template + void reportBug(const DeclType *Member, const Type *MemberType, const CXXRecordDecl *MemberCXXRD, - const RecordDecl *ClassCXXRD) const { + const ParentDeclType *ClassCXXRD) const { assert(Member); assert(MemberType); assert(MemberCXXRD); @@ -131,7 +162,10 @@ class RawPtrRefMemberChecker SmallString<100> Buf; llvm::raw_svector_ostream Os(Buf); - Os << "Member variable "; + if (isa(ClassCXXRD)) + Os << "Instance variable "; + else + Os << "Member variable "; printQuotedName(Os, Member); Os << " in "; printQuotedQualifiedName(Os, ClassCXXRD); diff --git a/clang/test/Analysis/Checkers/WebKit/unchecked-members-objc.mm b/clang/test/Analysis/Checkers/WebKit/unchecked-members-objc.mm new file mode 100644 index 0000000000000..a9a9a367fb9f4 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unchecked-members-objc.mm @@ -0,0 +1,35 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUncheckedPtrMemberChecker -verify %s + +#include "mock-types.h" + +__attribute__((objc_root_class)) +@interface NSObject ++ (instancetype) alloc; +- (instancetype) init; +- (instancetype)retain; +- (void)release; +@end + +void doSomeWork(); + +@interface SomeObjC : NSObject { + CheckedObj* _unchecked1; +// expected-warning@-1{{Instance variable '_unchecked1' in 'SomeObjC' is a raw pointer to CheckedPtr capable type 'CheckedObj'}} + CheckedPtr _counted1; + [[clang::suppress]] CheckedObj* _unchecked2; +} +- (void)doWork; +@end + +@implementation SomeObjC { + CheckedObj* _unchecked3; +// expected-warning@-1{{Instance variable '_unchecked3' in 'SomeObjC' is a raw pointer to 
CheckedPtr capable type 'CheckedObj'}} + CheckedPtr _counted2; + [[clang::suppress]] CheckedObj* _unchecked4; +} + +- (void)doWork { + doSomeWork(); +} + +@end diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-members-objc.mm b/clang/test/Analysis/Checkers/WebKit/uncounted-members-objc.mm new file mode 100644 index 0000000000000..83b08a6841d26 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-members-objc.mm @@ -0,0 +1,35 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.NoUncountedMemberChecker -verify %s + +#include "mock-types.h" + +__attribute__((objc_root_class)) +@interface NSObject ++ (instancetype) alloc; +- (instancetype) init; +- (instancetype)retain; +- (void)release; +@end + +void doSomeWork(); + +@interface SomeObjC : NSObject { + RefCountable* _uncounted1; +// expected-warning@-1{{Instance variable '_uncounted1' in 'SomeObjC' is a raw pointer to ref-countable type 'RefCountable'}} + RefPtr _counted1; + [[clang::suppress]] RefCountable* _uncounted2; +} +- (void)doWork; +@end + +@implementation SomeObjC { + RefCountable* _uncounted3; +// expected-warning@-1{{Instance variable '_uncounted3' in 'SomeObjC' is a raw pointer to ref-countable type 'RefCountable'}} + RefPtr _counted2; + [[clang::suppress]] RefCountable* _uncounted4; +} + +- (void)doWork { + doSomeWork(); +} + +@end From 526b34a5e15ad4a8633713cdffe42286ff390138 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 13 Jan 2025 21:57:03 +0000 Subject: [PATCH 125/220] adding rootsignature to obj2yaml --- llvm/include/llvm/BinaryFormat/DXContainer.h | 9 +++++++ llvm/include/llvm/Object/DXContainer.h | 5 ++-- .../include/llvm/ObjectYAML/DXContainerYAML.h | 22 ++++++--------- llvm/lib/Object/DXContainer.cpp | 10 +++---- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 14 +++------- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 15 +++++------ .../RootSignatures/FlagsElement.ll | 27 +++++++++++++++++++ llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 9 ++++--- 8 files changed, 69 insertions(+), 42 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index bd5a796c0b31c..ad39d55e55057 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -63,6 +63,15 @@ struct ShaderHash { void swapBytes() { sys::swapByteOrder(Flags); } }; +struct RootSignatureDesc { + uint32_t Version; + uint32_t Flags; + void swapBytes() { + sys::swapByteOrder(Version); + sys::swapByteOrder(Flags); + } +}; + struct ContainerVersion { uint16_t Major; uint16_t Minor; diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index c3a2f756bd683..d7a397b608513 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -309,7 +309,7 @@ class DXContainer { std::optional ShaderFeatureFlags; std::optional Hash; std::optional PSVInfo; - std::optional RootSignature; + std::optional RootSignature; DirectX::Signature InputSignature; DirectX::Signature OutputSignature; DirectX::Signature PatchConstantSignature; @@ -406,7 +406,8 @@ class DXContainer { std::optional getShaderHash() const { return Hash; } - std::optional getRootSignature() const { + std::optional + getRootSignature() const { return RootSignature; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index ecad35e82b155..9233f64b8e506 100644 --- 
a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -73,20 +73,14 @@ struct ShaderHash { std::vector Digest; }; -#define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; -struct RootSignatureYamlDesc { - RootSignatureYamlDesc() = default; - RootSignatureYamlDesc(const object::DirectX::RootSignature &Data); - uint32_t Version; - uint32_t NumParameters; - uint32_t RootParametersOffset; - uint32_t NumStaticSamplers; - uint32_t StaticSamplersOffset; - uint32_t getEncodedFlags(); +struct RootSignatureDesc { + RootSignatureDesc() = default; + RootSignatureDesc(const dxbc::RootSignatureDesc &Data); -#include "llvm/BinaryFormat/DXContainerConstants.def" + uint32_t Version; + uint32_t Flags; }; using ResourceFlags = dxbc::PSV::ResourceFlags; @@ -176,7 +170,7 @@ struct Part { std::optional Hash; std::optional Info; std::optional Signature; - std::optional RootSignature; + std::optional RootSignature; }; struct Object { @@ -259,9 +253,9 @@ template <> struct MappingTraits { static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El); }; -template <> struct MappingTraits { +template <> struct MappingTraits { static void mapping(IO &IO, - DXContainerYAML::RootSignatureYamlDesc &RootSignature); + DXContainerYAML::RootSignatureDesc &RootSignature); }; } // namespace yaml diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 1eb1453c65147..9f2a50829ecc6 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -10,7 +10,7 @@ #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" -#include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; @@ -98,11 +98,10 @@ Error DXContainer::parseHash(StringRef Part) { } Error DXContainer::parseRootSignature(StringRef Part) { - if (RootSignature) - return parseFailed("More than one RTS0 part is present in the file"); - RootSignature = DirectX::RootSignature(); - if (Error Err = RootSignature->parse(Part)) + dxbc::RootSignatureDesc Desc; + if (Error Err = readStruct(Part, Part.begin(), Desc)) return Err; + RootSignature = Desc; return Error::success(); } @@ -210,6 +209,7 @@ Error DXContainer::parsePartOffsets() { case dxbc::PartType::RTS0: if (Error Err = parseRootSignature(PartData)) return Err; + break; } } diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index f6ed09c857bb7..bca55782fad98 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -265,16 +265,10 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { case dxbc::PartType::RTS0: if (!P.RootSignature.has_value()) continue; - - mcdxbc::RootSignatureDesc RS; - RS.Flags = P.RootSignature->getEncodedFlags(); - RS.Version = P.RootSignature->Version; - RS.NumParameters = P.RootSignature->NumParameters; - RS.RootParametersOffset = P.RootSignature->RootParametersOffset; - RS.NumStaticSamplers = P.RootSignature->NumStaticSamplers; - RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; - - RS.write(OS); + dxbc::RootSignatureDesc RS = {P.RootSignature->Version, P.RootSignature->Flags}; + if (sys::IsBigEndianHost) + RS.swapBytes(); + OS.write(reinterpret_cast(&RS), sizeof(dxbc::RootSignatureDesc)); break; } uint64_t BytesWritten = OS.tell() - DataStart; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 
f03c7da65999d..22ab123152232 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -66,6 +66,10 @@ DXContainerYAML::ShaderHash::ShaderHash(const dxbc::ShaderHash &Data) memcpy(Digest.data(), &Data.Digest[0], 16); } +DXContainerYAML::RootSignatureDesc::RootSignatureDesc(const dxbc::RootSignatureDesc &Data) + : Version(Data.Version), Flags(Data.Flags) { +} + DXContainerYAML::PSVInfo::PSVInfo() : Version(0) { memset(&Info, 0, sizeof(Info)); } @@ -209,15 +213,10 @@ void MappingTraits::mapping( IO.mapRequired("Parameters", S.Parameters); } -void MappingTraits::mapping( - IO &IO, DXContainerYAML::RootSignatureYamlDesc &S) { +void MappingTraits::mapping( + IO &IO, DXContainerYAML::RootSignatureDesc &S) { IO.mapRequired("Version", S.Version); - IO.mapRequired("NumParameters", S.NumParameters); - IO.mapRequired("RootParametersOffset", S.RootParametersOffset); - IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); - IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); -#define ROOT_ELEMENT_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false); -#include "llvm/BinaryFormat/DXContainerConstants.def" + IO.mapRequired("Flags", S.Flags); } void MappingTraits::mapping(IO &IO, diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll new file mode 100644 index 0000000000000..402f03a4dd589 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll @@ -0,0 +1,27 @@ +; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC + +target triple = "dxil-unknown-shadermodel6.0-compute" + +; CHECK: @dx.rts0 = private constant [8 x i8] c"{{.*}}", section "RTS0", align 4 + + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + + +!dx.rootsignatures = !{!2} ; list of function/root signature pairs +!2 = !{ ptr @main, !3 } ; function, root signature +!3 = !{ !4 } ; list of root signature elements +!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout + + +; DXC: - Name: RTS0 +; DXC-NEXT: Size: 8 +; DXC-NEXT: RootSignature: +; DXC-NEXT: Version: 1.0 +; DXC-NEXT: Flags: AllowInputAssemblerInputLayout diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index f3ef1b6a27bcf..90ee47cd46994 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// #include "obj2yaml.h" +#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/DXContainer.h" #include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include @@ -154,9 +156,10 @@ dumpDXContainer(MemoryBufferRef Source) { case dxbc::PartType::Unknown: break; case dxbc::PartType::RTS0: - std::optional RS = Container.getRootSignature(); - if (RS.has_value()) - NewPart.RootSignature = DXContainerYAML::RootSignatureYamlDesc(*RS); + std::optional RS = Container.getRootSignature(); + if (RS && RS.has_value()) + NewPart.RootSignature = DXContainerYAML::RootSignatureDesc(*RS); + break; break; } } From e84a8e1df1d8fa0552e83c2c3f52e59881f10ffe Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 13 Jan 2025 22:31:14 +0000 Subject: [PATCH 126/220] adding test --- 
.../RootSignature-FlagsRootElement.yaml | 242 ++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml new file mode 100644 index 0000000000000..5435c432a073e --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml @@ -0,0 +1,242 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + FileSize: 1672 + PartCount: 7 + PartOffsets: [ 60, 1496, 1512, 1540, 1556, 1572, 1588 ] +Parts: + - Name: DXIL + Size: 1428 + Program: + MajorVersion: 6 + MinorVersion: 0 + ShaderKind: 5 + Size: 357 + DXILMajorVersion: 1 + DXILMinorVersion: 0 + DXILSize: 1404 + DXIL: [ 0x42, 0x43, 0xC0, 0xDE, 0x21, 0xC, 0x0, 0x0, 0x5C, + 0x1, 0x0, 0x0, 0xB, 0x82, 0x20, 0x0, 0x2, 0x0, + 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x7, 0x81, 0x23, + 0x91, 0x41, 0xC8, 0x4, 0x49, 0x6, 0x10, 0x32, + 0x39, 0x92, 0x1, 0x84, 0xC, 0x25, 0x5, 0x8, 0x19, + 0x1E, 0x4, 0x8B, 0x62, 0x80, 0x10, 0x45, 0x2, + 0x42, 0x92, 0xB, 0x42, 0x84, 0x10, 0x32, 0x14, + 0x38, 0x8, 0x18, 0x4B, 0xA, 0x32, 0x42, 0x88, + 0x48, 0x90, 0x14, 0x20, 0x43, 0x46, 0x88, 0xA5, + 0x0, 0x19, 0x32, 0x42, 0xE4, 0x48, 0xE, 0x90, + 0x11, 0x22, 0xC4, 0x50, 0x41, 0x51, 0x81, 0x8C, + 0xE1, 0x83, 0xE5, 0x8A, 0x4, 0x21, 0x46, 0x6, + 0x89, 0x20, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0, 0x32, + 0x22, 0x8, 0x9, 0x20, 0x64, 0x85, 0x4, 0x13, 0x22, + 0xA4, 0x84, 0x4, 0x13, 0x22, 0xE3, 0x84, 0xA1, + 0x90, 0x14, 0x12, 0x4C, 0x88, 0x8C, 0xB, 0x84, + 0x84, 0x4C, 0x10, 0x20, 0x73, 0x4, 0x8, 0xC1, + 0x65, 0xC3, 0x85, 0x2C, 0xE8, 0x3, 0x40, 0x14, + 0x91, 0x4E, 0xD1, 0x4A, 0x48, 0x44, 0x54, 0x11, + 0xC3, 0x9, 0x30, 0xC4, 0x18, 0x1, 0x30, 0x2, 0x50, + 0x82, 0x21, 0x1A, 0x8, 0x98, 0x23, 0x0, 0x3, 0x0, + 0x13, 0x14, 0x72, 0xC0, 0x87, 0x74, 0x60, 0x87, + 0x36, 0x68, 0x87, 0x79, 0x68, 0x3, 0x72, 0xC0, + 0x87, 0xD, 0xAE, 0x50, 0xE, 0x6D, 0xD0, 0xE, 0x7A, + 0x50, 0xE, 0x6D, 0x0, 0xF, 0x7A, 0x30, 0x7, 0x72, + 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x71, + 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x78, + 0xA0, 0x7, 0x78, 0xD0, 0x6, 0xE9, 0x10, 0x7, 0x76, + 0xA0, 0x7, 0x71, 0x60, 0x7, 0x6D, 0x90, 0xE, 0x73, + 0x20, 0x7, 0x7A, 0x30, 0x7, 0x72, 0xD0, 0x6, 0xE9, + 0x60, 0x7, 0x74, 0xA0, 0x7, 0x76, 0x40, 0x7, 0x6D, + 0x60, 0xE, 0x71, 0x60, 0x7, 0x7A, 0x10, 0x7, 0x76, + 0xD0, 0x6, 0xE6, 0x30, 0x7, 0x72, 0xA0, 0x7, 0x73, + 0x20, 0x7, 0x6D, 0x60, 0xE, 0x76, 0x40, 0x7, 0x7A, + 0x60, 0x7, 0x74, 0xD0, 0x6, 0xEE, 0x80, 0x7, 0x7A, + 0x10, 0x7, 0x76, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x7A, + 0x60, 0x7, 0x74, 0x30, 0xE4, 0x21, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xB, + 0x4, 0x6, 0x0, 0x0, 0x0, 0x32, 0x1E, 0x98, 0xC, + 0x19, 0x11, 0x4C, 0x90, 0x8C, 0x9, 0x26, 0x47, + 0xC6, 0x4, 0x43, 0xBA, 0x12, 0x28, 0x86, 0x11, + 0x80, 0x42, 0x0, 0x0, 0x79, 0x18, 0x0, 0x0, 0xCB, + 0x0, 0x0, 0x0, 0x33, 0x8, 0x80, 0x1C, 0xC4, 0xE1, + 0x1C, 0x66, 0x14, 0x1, 0x3D, 0x88, 0x43, 0x38, + 0x84, 0xC3, 0x8C, 0x42, 0x80, 0x7, 0x79, 0x78, + 0x7, 0x73, 0x98, 0x71, 0xC, 0xE6, 0x0, 0xF, 0xED, + 0x10, 0xE, 0xF4, 0x80, 0xE, 0x33, 0xC, 0x42, 0x1E, + 0xC2, 0xC1, 0x1D, 0xCE, 0xA1, 0x1C, 0x66, 0x30, + 0x5, 0x3D, 0x88, 0x43, 0x38, 0x84, 0x83, 0x1B, + 0xCC, 0x3, 0x3D, 0xC8, 0x43, 0x3D, 0x8C, 0x3, + 0x3D, 0xCC, 0x78, 0x8C, 0x74, 0x70, 0x7, 
0x7B, + 0x8, 0x7, 0x79, 0x48, 0x87, 0x70, 0x70, 0x7, 0x7A, + 0x70, 0x3, 0x76, 0x78, 0x87, 0x70, 0x20, 0x87, + 0x19, 0xCC, 0x11, 0xE, 0xEC, 0x90, 0xE, 0xE1, + 0x30, 0xF, 0x6E, 0x30, 0xF, 0xE3, 0xF0, 0xE, 0xF0, + 0x50, 0xE, 0x33, 0x10, 0xC4, 0x1D, 0xDE, 0x21, + 0x1C, 0xD8, 0x21, 0x1D, 0xC2, 0x61, 0x1E, 0x66, + 0x30, 0x89, 0x3B, 0xBC, 0x83, 0x3B, 0xD0, 0x43, + 0x39, 0xB4, 0x3, 0x3C, 0xBC, 0x83, 0x3C, 0x84, + 0x3, 0x3B, 0xCC, 0xF0, 0x14, 0x76, 0x60, 0x7, + 0x7B, 0x68, 0x7, 0x37, 0x68, 0x87, 0x72, 0x68, + 0x7, 0x37, 0x80, 0x87, 0x70, 0x90, 0x87, 0x70, + 0x60, 0x7, 0x76, 0x28, 0x7, 0x76, 0xF8, 0x5, 0x76, + 0x78, 0x87, 0x77, 0x80, 0x87, 0x5F, 0x8, 0x87, + 0x71, 0x18, 0x87, 0x72, 0x98, 0x87, 0x79, 0x98, + 0x81, 0x2C, 0xEE, 0xF0, 0xE, 0xEE, 0xE0, 0xE, + 0xF5, 0xC0, 0xE, 0xEC, 0x30, 0x3, 0x62, 0xC8, + 0xA1, 0x1C, 0xE4, 0xA1, 0x1C, 0xCC, 0xA1, 0x1C, + 0xE4, 0xA1, 0x1C, 0xDC, 0x61, 0x1C, 0xCA, 0x21, + 0x1C, 0xC4, 0x81, 0x1D, 0xCA, 0x61, 0x6, 0xD6, + 0x90, 0x43, 0x39, 0xC8, 0x43, 0x39, 0x98, 0x43, + 0x39, 0xC8, 0x43, 0x39, 0xB8, 0xC3, 0x38, 0x94, + 0x43, 0x38, 0x88, 0x3, 0x3B, 0x94, 0xC3, 0x2F, + 0xBC, 0x83, 0x3C, 0xFC, 0x82, 0x3B, 0xD4, 0x3, + 0x3B, 0xB0, 0xC3, 0xC, 0xC7, 0x69, 0x87, 0x70, + 0x58, 0x87, 0x72, 0x70, 0x83, 0x74, 0x68, 0x7, + 0x78, 0x60, 0x87, 0x74, 0x18, 0x87, 0x74, 0xA0, + 0x87, 0x19, 0xCE, 0x53, 0xF, 0xEE, 0x0, 0xF, 0xF2, + 0x50, 0xE, 0xE4, 0x90, 0xE, 0xE3, 0x40, 0xF, 0xE1, + 0x20, 0xE, 0xEC, 0x50, 0xE, 0x33, 0x20, 0x28, + 0x1D, 0xDC, 0xC1, 0x1E, 0xC2, 0x41, 0x1E, 0xD2, + 0x21, 0x1C, 0xDC, 0x81, 0x1E, 0xDC, 0xE0, 0x1C, + 0xE4, 0xE1, 0x1D, 0xEA, 0x1, 0x1E, 0x66, 0x18, + 0x51, 0x38, 0xB0, 0x43, 0x3A, 0x9C, 0x83, 0x3B, + 0xCC, 0x50, 0x24, 0x76, 0x60, 0x7, 0x7B, 0x68, + 0x7, 0x37, 0x60, 0x87, 0x77, 0x78, 0x7, 0x78, + 0x98, 0x51, 0x4C, 0xF4, 0x90, 0xF, 0xF0, 0x50, + 0xE, 0x33, 0x1E, 0x6A, 0x1E, 0xCA, 0x61, 0x1C, + 0xE8, 0x21, 0x1D, 0xDE, 0xC1, 0x1D, 0x7E, 0x1, + 0x1E, 0xE4, 0xA1, 0x1C, 0xCC, 0x21, 0x1D, 0xF0, + 0x61, 0x6, 0x54, 0x85, 0x83, 0x38, 0xCC, 0xC3, + 0x3B, 0xB0, 0x43, 0x3D, 0xD0, 0x43, 0x39, 0xFC, + 0xC2, 0x3C, 0xE4, 0x43, 0x3B, 0x88, 0xC3, 0x3B, + 0xB0, 0xC3, 0x8C, 0xC5, 0xA, 0x87, 0x79, 0x98, + 0x87, 0x77, 0x18, 0x87, 0x74, 0x8, 0x7, 0x7A, + 0x28, 0x7, 0x72, 0x98, 0x81, 0x5C, 0xE3, 0x10, + 0xE, 0xEC, 0xC0, 0xE, 0xE5, 0x50, 0xE, 0xF3, 0x30, + 0x23, 0xC1, 0xD2, 0x41, 0x1E, 0xE4, 0xE1, 0x17, + 0xD8, 0xE1, 0x1D, 0xDE, 0x1, 0x1E, 0x66, 0x48, + 0x19, 0x3B, 0xB0, 0x83, 0x3D, 0xB4, 0x83, 0x1B, + 0x84, 0xC3, 0x38, 0x8C, 0x43, 0x39, 0xCC, 0xC3, + 0x3C, 0xB8, 0xC1, 0x39, 0xC8, 0xC3, 0x3B, 0xD4, + 0x3, 0x3C, 0xCC, 0x48, 0xB4, 0x71, 0x8, 0x7, 0x76, + 0x60, 0x7, 0x71, 0x8, 0x87, 0x71, 0x58, 0x87, + 0x19, 0xDB, 0xC6, 0xE, 0xEC, 0x60, 0xF, 0xED, + 0xE0, 0x6, 0xF0, 0x20, 0xF, 0xE5, 0x30, 0xF, 0xE5, + 0x20, 0xF, 0xF6, 0x50, 0xE, 0x6E, 0x10, 0xE, 0xE3, + 0x30, 0xE, 0xE5, 0x30, 0xF, 0xF3, 0xE0, 0x6, 0xE9, + 0xE0, 0xE, 0xE4, 0x50, 0xE, 0xF8, 0x30, 0x23, + 0xE2, 0xEC, 0x61, 0x1C, 0xC2, 0x81, 0x1D, 0xD8, + 0xE1, 0x17, 0xEC, 0x21, 0x1D, 0xE6, 0x21, 0x1D, + 0xC4, 0x21, 0x1D, 0xD8, 0x21, 0x1D, 0xE8, 0x21, + 0x1F, 0x66, 0x20, 0x9D, 0x3B, 0xBC, 0x43, 0x3D, + 0xB8, 0x3, 0x39, 0x94, 0x83, 0x39, 0xCC, 0x58, + 0xBC, 0x70, 0x70, 0x7, 0x77, 0x78, 0x7, 0x7A, + 0x8, 0x7, 0x7A, 0x48, 0x87, 0x77, 0x70, 0x87, + 0x19, 0xCB, 0xE7, 0xE, 0xEF, 0x30, 0xF, 0xE1, + 0xE0, 0xE, 0xE9, 0x40, 0xF, 0xE9, 0xA0, 0xF, 0xE5, + 0x30, 0xC3, 0x1, 0x3, 0x73, 0xA8, 0x7, 0x77, 0x18, + 0x87, 0x5F, 0x98, 0x87, 0x70, 0x70, 0x87, 0x74, + 0xA0, 0x87, 0x74, 0xD0, 0x87, 0x72, 0x98, 0x81, + 0x84, 0x41, 0x39, 0xE0, 0xC3, 0x38, 0xB0, 0x43, + 0x3D, 
0x90, 0x43, 0x39, 0xCC, 0x40, 0xC4, 0xA0, + 0x1D, 0xCA, 0xA1, 0x1D, 0xE0, 0x41, 0x1E, 0xDE, + 0xC1, 0x1C, 0x66, 0x24, 0x63, 0x30, 0xE, 0xE1, + 0xC0, 0xE, 0xEC, 0x30, 0xF, 0xE9, 0x40, 0xF, 0xE5, + 0x30, 0x43, 0x21, 0x83, 0x75, 0x18, 0x7, 0x73, + 0x48, 0x87, 0x5F, 0xA0, 0x87, 0x7C, 0x80, 0x87, + 0x72, 0x98, 0xB1, 0x94, 0x1, 0x3C, 0x8C, 0xC3, + 0x3C, 0x94, 0xC3, 0x38, 0xD0, 0x43, 0x3A, 0xBC, + 0x83, 0x3B, 0xCC, 0xC3, 0x8C, 0xC5, 0xC, 0x48, + 0x21, 0x15, 0x42, 0x61, 0x1E, 0xE6, 0x21, 0x1D, + 0xCE, 0xC1, 0x1D, 0x52, 0x81, 0x14, 0x66, 0x4C, + 0x67, 0x30, 0xE, 0xEF, 0x20, 0xF, 0xEF, 0xE0, + 0x6, 0xEF, 0x50, 0xF, 0xF4, 0x30, 0xF, 0xE9, 0x40, + 0xE, 0xE5, 0xE0, 0x6, 0xE6, 0x20, 0xF, 0xE1, 0xD0, + 0xE, 0xE5, 0x30, 0xA3, 0x40, 0x83, 0x76, 0x68, + 0x7, 0x79, 0x8, 0x87, 0x19, 0x52, 0x1A, 0xB8, + 0xC3, 0x3B, 0x84, 0x3, 0x3B, 0xA4, 0x43, 0x38, + 0xCC, 0x83, 0x1B, 0x84, 0x3, 0x39, 0x90, 0x83, + 0x3C, 0xCC, 0x3, 0x3C, 0x84, 0xC3, 0x38, 0x94, + 0x3, 0x0, 0x0, 0x0, 0x0, 0x79, 0x28, 0x0, 0x0, + 0x2A, 0x0, 0x0, 0x0, 0xC2, 0x3C, 0x90, 0x40, 0x86, + 0x10, 0x19, 0x32, 0xE2, 0x64, 0x90, 0x40, 0x46, + 0x2, 0x19, 0x23, 0x23, 0x46, 0x2, 0x13, 0x24, + 0xC6, 0x0, 0x13, 0x74, 0x12, 0xA9, 0xB7, 0x37, + 0x3A, 0x23, 0xB6, 0xB0, 0xB3, 0xB9, 0x23, 0x8C, + 0xCD, 0x1D, 0xA2, 0x2D, 0x2C, 0xCD, 0x6D, 0x8, + 0x42, 0x1, 0xC, 0x41, 0x38, 0x82, 0x21, 0x8, 0x87, + 0x30, 0x4, 0xE1, 0x18, 0x86, 0x20, 0x1C, 0xC4, + 0x18, 0x84, 0xA0, 0x18, 0x43, 0x90, 0x8C, 0x41, + 0x20, 0x94, 0x31, 0xC, 0x82, 0x71, 0x8C, 0x41, + 0x28, 0x8E, 0x31, 0xC, 0x45, 0x51, 0x8C, 0x41, + 0x40, 0x9C, 0x31, 0x14, 0xC4, 0x0, 0x0, 0x8F, + 0x89, 0xC8, 0xF0, 0x5C, 0xE4, 0xDE, 0xDE, 0xE8, + 0xE6, 0xD2, 0xCE, 0xDC, 0xC2, 0xE8, 0xEA, 0xE4, + 0xCA, 0xE6, 0x86, 0x12, 0x28, 0xC6, 0x21, 0xC3, + 0x73, 0x99, 0x43, 0xB, 0x23, 0x2B, 0x93, 0x6B, + 0x7A, 0x23, 0x2B, 0x63, 0x1B, 0x4A, 0xB0, 0x18, + 0x85, 0xC, 0xCF, 0xC5, 0xAE, 0x4C, 0x6E, 0x2E, + 0xED, 0xCD, 0x6D, 0x28, 0x1, 0x63, 0x1C, 0x32, + 0x3C, 0x97, 0x32, 0x37, 0x3A, 0xB9, 0x3C, 0xA8, + 0xB7, 0x34, 0x37, 0xBA, 0xB9, 0xA1, 0x4, 0xF, + 0x0, 0x0, 0x71, 0x20, 0x0, 0x0, 0x2, 0x0, 0x0, + 0x0, 0x6, 0x40, 0x30, 0x0, 0xD2, 0x0, 0x0, 0x0, + 0x61, 0x20, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x13, + 0x4, 0x1, 0x86, 0x3, 0x1, 0x0, 0x0, 0x2, 0x0, + 0x0, 0x0, 0x7, 0x50, 0x10, 0xCD, 0x14, 0x61, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + - Name: SFI0 + Size: 8 + - Name: HASH + Size: 20 + Hash: + IncludesSource: false + Digest: [ 0xCE, 0xA, 0x5B, 0x9C, 0xBF, 0x9A, 0xBB, 0x5, + 0x19, 0xC5, 0x96, 0x78, 0x49, 0x89, 0x5C, 0x6B ] + - Name: ISG1 + Size: 8 + Signature: + Parameters: [] + - Name: OSG1 + Size: 8 + Signature: + Parameters: [] + - Name: RTS0 + Size: 8 + RootSignature: + Version: 1 + Flags: 8 + - Name: PSV0 + Size: 76 + PSVInfo: + Version: 3 + ShaderStage: 5 + MinimumWaveLaneCount: 0 + MaximumWaveLaneCount: 4294967295 + UsesViewID: 0 + SigInputVectors: 0 + SigOutputVectors: [ 0, 0, 0, 0 ] + NumThreadsX: 1 + NumThreadsY: 1 + NumThreadsZ: 1 + EntryName: main + ResourceStride: 24 + Resources: [] + SigInputElements: [] + SigOutputElements: [] + SigPatchOrPrimElements: [] + InputOutputMap: + - [ ] + - [ ] + - [ ] + - [ ] + +# CHECK: - Name: RTS0 +# CHECK-NEXT: Size: 8 +# CHECK-NEXT: RootSignature: +# CHECK-NEXT: Version: 1 +# CHECK-NEXT: Flags: 8 From f25fd64edca98367396ab747a512f03b6a8fadfa Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 13 Jan 2025 22:35:01 +0000 Subject: [PATCH 127/220] removing old test --- .../RootSignatures/FlagsElement.ll | 27 ------------------- 1 file changed, 27 deletions(-) delete mode 100644 
llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll deleted file mode 100644 index 402f03a4dd589..0000000000000 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignatures/FlagsElement.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s -; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC - -target triple = "dxil-unknown-shadermodel6.0-compute" - -; CHECK: @dx.rts0 = private constant [8 x i8] c"{{.*}}", section "RTS0", align 4 - - -define void @main() #0 { -entry: - ret void -} - -attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } - - -!dx.rootsignatures = !{!2} ; list of function/root signature pairs -!2 = !{ ptr @main, !3 } ; function, root signature -!3 = !{ !4 } ; list of root signature elements -!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout - - -; DXC: - Name: RTS0 -; DXC-NEXT: Size: 8 -; DXC-NEXT: RootSignature: -; DXC-NEXT: Version: 1.0 -; DXC-NEXT: Flags: AllowInputAssemblerInputLayout From 16552f0ed20ad412d792500c2d0e2d793b4ade9e Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 13 Jan 2025 22:43:27 +0000 Subject: [PATCH 128/220] remove useless includes --- llvm/lib/Object/DXContainer.cpp | 1 - llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 9f2a50829ecc6..3c5d310d75a6d 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -10,7 +10,6 @@ #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index 90ee47cd46994..6ae0a0859b48e 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -7,11 +7,9 @@ //===----------------------------------------------------------------------===// #include "obj2yaml.h" -#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/DXContainer.h" #include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/Support/Error.h" -#include "llvm/Support/ErrorHandling.h" #include From a6eb4c0404f1fad12654448bdacfbd6dec485004 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 14 Jan 2025 23:42:24 +0000 Subject: [PATCH 129/220] addressing comments --- .../BinaryFormat/DXContainerConstants.def | 3 +- .../include/llvm/ObjectYAML/DXContainerYAML.h | 5 +- llvm/lib/Object/DXContainer.cpp | 3 + llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 5 +- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 27 +- .../RootSignature-FlagsRootElement.yaml | 259 ++---------------- 6 files changed, 52 insertions(+), 250 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 6d44ea14df444..2a3f4b4fc5bd6 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -53,10 +53,9 @@ SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a #undef SHADER_FEATURE_FLAG #endif // SHADER_FEATURE_FLAG - -// ROOT_ELEMENT_FLAG(bit offset for the flag, name). 
#ifdef ROOT_ELEMENT_FLAG + ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout) ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess) ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess) diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 9233f64b8e506..9724c4eda746b 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -74,13 +74,14 @@ struct ShaderHash { }; - +#define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; struct RootSignatureDesc { RootSignatureDesc() = default; RootSignatureDesc(const dxbc::RootSignatureDesc &Data); + uint32_t getEncodedFlags(); uint32_t Version; - uint32_t Flags; +#include "llvm/BinaryFormat/DXContainerConstants.def" }; using ResourceFlags = dxbc::PSV::ResourceFlags; diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 3c5d310d75a6d..1465128e04f86 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -8,6 +8,7 @@ #include "llvm/Object/DXContainer.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/FormatVariadic.h" @@ -97,6 +98,8 @@ Error DXContainer::parseHash(StringRef Part) { } Error DXContainer::parseRootSignature(StringRef Part) { + if (RootSignature) + return parseFailed("More than one RTS0 part is present in the file"); dxbc::RootSignatureDesc Desc; if (Error Err = readStruct(Part, Part.begin(), Desc)) return Err; diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index bca55782fad98..eb5dc95deede6 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -265,7 +265,10 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { case dxbc::PartType::RTS0: if (!P.RootSignature.has_value()) continue; - dxbc::RootSignatureDesc RS = {P.RootSignature->Version, P.RootSignature->Flags}; + uint32_t Flags = P.RootSignature->getEncodedFlags(); + if (sys::IsBigEndianHost) + sys::swapByteOrder(Flags); + dxbc::RootSignatureDesc RS = {P.RootSignature->Version, Flags}; if (sys::IsBigEndianHost) RS.swapBytes(); OS.write(reinterpret_cast(&RS), sizeof(dxbc::RootSignatureDesc)); diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 22ab123152232..d226a5b2e5942 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/ScopedPrinter.h" +#include namespace llvm { @@ -29,21 +30,17 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { #include "llvm/BinaryFormat/DXContainerConstants.def" } -DXContainerYAML::RootSignatureYamlDesc::RootSignatureYamlDesc( - const object::DirectX::RootSignature &Data) - : Version(Data.getVersion()), NumParameters(Data.getNumParameters()), - RootParametersOffset(Data.getRootParametersOffset()), - NumStaticSamplers(Data.getNumStaticSamplers()), - StaticSamplersOffset(Data.getStaticSamplersOffset()) { - uint32_t Flags = Data.getFlags(); -#define ROOT_ELEMENT_FLAG(Num, Val) \ - Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; + +DXContainerYAML::RootSignatureDesc::RootSignatureDesc(const dxbc::RootSignatureDesc &Data): Version(Data.Version) { +#define ROOT_ELEMENT_FLAG(Num, Val) \ + Val = (Data.Flags & 
(uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } -uint32_t DXContainerYAML::RootSignatureYamlDesc::getEncodedFlags() { + +uint32_t DXContainerYAML::RootSignatureDesc::getEncodedFlags() { uint64_t Flag = 0; -#define ROOT_ELEMENT_FLAG(Num, Val) \ +#define ROOT_ELEMENT_FLAG(Num, Val) \ if (Val) \ Flag |= (uint32_t)dxbc::RootElementFlag::Val; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -66,10 +63,6 @@ DXContainerYAML::ShaderHash::ShaderHash(const dxbc::ShaderHash &Data) memcpy(Digest.data(), &Data.Digest[0], 16); } -DXContainerYAML::RootSignatureDesc::RootSignatureDesc(const dxbc::RootSignatureDesc &Data) - : Version(Data.Version), Flags(Data.Flags) { -} - DXContainerYAML::PSVInfo::PSVInfo() : Version(0) { memset(&Info, 0, sizeof(Info)); } @@ -216,7 +209,9 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureDesc &S) { IO.mapRequired("Version", S.Version); - IO.mapRequired("Flags", S.Flags); + #define ROOT_ELEMENT_FLAG(Num, Val) \ + IO.mapRequired(#Val, S.Val); + #include "llvm/BinaryFormat/DXContainerConstants.def" } void MappingTraits::mapping(IO &IO, diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml index 5435c432a073e..2ed71091cacd4 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml @@ -6,237 +6,38 @@ Header: Version: Major: 1 Minor: 0 - FileSize: 1672 - PartCount: 7 - PartOffsets: [ 60, 1496, 1512, 1540, 1556, 1572, 1588 ] + PartCount: 1 + PartOffsets: [ 60 ] Parts: - - Name: DXIL - Size: 1428 - Program: - MajorVersion: 6 - MinorVersion: 0 - ShaderKind: 5 - Size: 357 - DXILMajorVersion: 1 - DXILMinorVersion: 0 - DXILSize: 1404 - DXIL: [ 0x42, 0x43, 0xC0, 0xDE, 0x21, 0xC, 0x0, 0x0, 0x5C, - 0x1, 0x0, 0x0, 0xB, 0x82, 0x20, 0x0, 0x2, 0x0, - 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x7, 0x81, 0x23, - 0x91, 0x41, 0xC8, 0x4, 0x49, 0x6, 0x10, 0x32, - 0x39, 0x92, 0x1, 0x84, 0xC, 0x25, 0x5, 0x8, 0x19, - 0x1E, 0x4, 0x8B, 0x62, 0x80, 0x10, 0x45, 0x2, - 0x42, 0x92, 0xB, 0x42, 0x84, 0x10, 0x32, 0x14, - 0x38, 0x8, 0x18, 0x4B, 0xA, 0x32, 0x42, 0x88, - 0x48, 0x90, 0x14, 0x20, 0x43, 0x46, 0x88, 0xA5, - 0x0, 0x19, 0x32, 0x42, 0xE4, 0x48, 0xE, 0x90, - 0x11, 0x22, 0xC4, 0x50, 0x41, 0x51, 0x81, 0x8C, - 0xE1, 0x83, 0xE5, 0x8A, 0x4, 0x21, 0x46, 0x6, - 0x89, 0x20, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0, 0x32, - 0x22, 0x8, 0x9, 0x20, 0x64, 0x85, 0x4, 0x13, 0x22, - 0xA4, 0x84, 0x4, 0x13, 0x22, 0xE3, 0x84, 0xA1, - 0x90, 0x14, 0x12, 0x4C, 0x88, 0x8C, 0xB, 0x84, - 0x84, 0x4C, 0x10, 0x20, 0x73, 0x4, 0x8, 0xC1, - 0x65, 0xC3, 0x85, 0x2C, 0xE8, 0x3, 0x40, 0x14, - 0x91, 0x4E, 0xD1, 0x4A, 0x48, 0x44, 0x54, 0x11, - 0xC3, 0x9, 0x30, 0xC4, 0x18, 0x1, 0x30, 0x2, 0x50, - 0x82, 0x21, 0x1A, 0x8, 0x98, 0x23, 0x0, 0x3, 0x0, - 0x13, 0x14, 0x72, 0xC0, 0x87, 0x74, 0x60, 0x87, - 0x36, 0x68, 0x87, 0x79, 0x68, 0x3, 0x72, 0xC0, - 0x87, 0xD, 0xAE, 0x50, 0xE, 0x6D, 0xD0, 0xE, 0x7A, - 0x50, 0xE, 0x6D, 0x0, 0xF, 0x7A, 0x30, 0x7, 0x72, - 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x71, - 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x78, - 0xA0, 0x7, 0x78, 0xD0, 0x6, 0xE9, 0x10, 0x7, 0x76, - 0xA0, 0x7, 0x71, 0x60, 0x7, 0x6D, 0x90, 0xE, 0x73, - 0x20, 0x7, 0x7A, 0x30, 0x7, 0x72, 0xD0, 0x6, 0xE9, - 0x60, 0x7, 0x74, 0xA0, 0x7, 0x76, 0x40, 0x7, 0x6D, - 0x60, 0xE, 0x71, 0x60, 0x7, 0x7A, 0x10, 0x7, 0x76, - 0xD0, 0x6, 0xE6, 0x30, 0x7, 0x72, 0xA0, 0x7, 0x73, 
- 0x20, 0x7, 0x6D, 0x60, 0xE, 0x76, 0x40, 0x7, 0x7A, - 0x60, 0x7, 0x74, 0xD0, 0x6, 0xEE, 0x80, 0x7, 0x7A, - 0x10, 0x7, 0x76, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x7A, - 0x60, 0x7, 0x74, 0x30, 0xE4, 0x21, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xB, - 0x4, 0x6, 0x0, 0x0, 0x0, 0x32, 0x1E, 0x98, 0xC, - 0x19, 0x11, 0x4C, 0x90, 0x8C, 0x9, 0x26, 0x47, - 0xC6, 0x4, 0x43, 0xBA, 0x12, 0x28, 0x86, 0x11, - 0x80, 0x42, 0x0, 0x0, 0x79, 0x18, 0x0, 0x0, 0xCB, - 0x0, 0x0, 0x0, 0x33, 0x8, 0x80, 0x1C, 0xC4, 0xE1, - 0x1C, 0x66, 0x14, 0x1, 0x3D, 0x88, 0x43, 0x38, - 0x84, 0xC3, 0x8C, 0x42, 0x80, 0x7, 0x79, 0x78, - 0x7, 0x73, 0x98, 0x71, 0xC, 0xE6, 0x0, 0xF, 0xED, - 0x10, 0xE, 0xF4, 0x80, 0xE, 0x33, 0xC, 0x42, 0x1E, - 0xC2, 0xC1, 0x1D, 0xCE, 0xA1, 0x1C, 0x66, 0x30, - 0x5, 0x3D, 0x88, 0x43, 0x38, 0x84, 0x83, 0x1B, - 0xCC, 0x3, 0x3D, 0xC8, 0x43, 0x3D, 0x8C, 0x3, - 0x3D, 0xCC, 0x78, 0x8C, 0x74, 0x70, 0x7, 0x7B, - 0x8, 0x7, 0x79, 0x48, 0x87, 0x70, 0x70, 0x7, 0x7A, - 0x70, 0x3, 0x76, 0x78, 0x87, 0x70, 0x20, 0x87, - 0x19, 0xCC, 0x11, 0xE, 0xEC, 0x90, 0xE, 0xE1, - 0x30, 0xF, 0x6E, 0x30, 0xF, 0xE3, 0xF0, 0xE, 0xF0, - 0x50, 0xE, 0x33, 0x10, 0xC4, 0x1D, 0xDE, 0x21, - 0x1C, 0xD8, 0x21, 0x1D, 0xC2, 0x61, 0x1E, 0x66, - 0x30, 0x89, 0x3B, 0xBC, 0x83, 0x3B, 0xD0, 0x43, - 0x39, 0xB4, 0x3, 0x3C, 0xBC, 0x83, 0x3C, 0x84, - 0x3, 0x3B, 0xCC, 0xF0, 0x14, 0x76, 0x60, 0x7, - 0x7B, 0x68, 0x7, 0x37, 0x68, 0x87, 0x72, 0x68, - 0x7, 0x37, 0x80, 0x87, 0x70, 0x90, 0x87, 0x70, - 0x60, 0x7, 0x76, 0x28, 0x7, 0x76, 0xF8, 0x5, 0x76, - 0x78, 0x87, 0x77, 0x80, 0x87, 0x5F, 0x8, 0x87, - 0x71, 0x18, 0x87, 0x72, 0x98, 0x87, 0x79, 0x98, - 0x81, 0x2C, 0xEE, 0xF0, 0xE, 0xEE, 0xE0, 0xE, - 0xF5, 0xC0, 0xE, 0xEC, 0x30, 0x3, 0x62, 0xC8, - 0xA1, 0x1C, 0xE4, 0xA1, 0x1C, 0xCC, 0xA1, 0x1C, - 0xE4, 0xA1, 0x1C, 0xDC, 0x61, 0x1C, 0xCA, 0x21, - 0x1C, 0xC4, 0x81, 0x1D, 0xCA, 0x61, 0x6, 0xD6, - 0x90, 0x43, 0x39, 0xC8, 0x43, 0x39, 0x98, 0x43, - 0x39, 0xC8, 0x43, 0x39, 0xB8, 0xC3, 0x38, 0x94, - 0x43, 0x38, 0x88, 0x3, 0x3B, 0x94, 0xC3, 0x2F, - 0xBC, 0x83, 0x3C, 0xFC, 0x82, 0x3B, 0xD4, 0x3, - 0x3B, 0xB0, 0xC3, 0xC, 0xC7, 0x69, 0x87, 0x70, - 0x58, 0x87, 0x72, 0x70, 0x83, 0x74, 0x68, 0x7, - 0x78, 0x60, 0x87, 0x74, 0x18, 0x87, 0x74, 0xA0, - 0x87, 0x19, 0xCE, 0x53, 0xF, 0xEE, 0x0, 0xF, 0xF2, - 0x50, 0xE, 0xE4, 0x90, 0xE, 0xE3, 0x40, 0xF, 0xE1, - 0x20, 0xE, 0xEC, 0x50, 0xE, 0x33, 0x20, 0x28, - 0x1D, 0xDC, 0xC1, 0x1E, 0xC2, 0x41, 0x1E, 0xD2, - 0x21, 0x1C, 0xDC, 0x81, 0x1E, 0xDC, 0xE0, 0x1C, - 0xE4, 0xE1, 0x1D, 0xEA, 0x1, 0x1E, 0x66, 0x18, - 0x51, 0x38, 0xB0, 0x43, 0x3A, 0x9C, 0x83, 0x3B, - 0xCC, 0x50, 0x24, 0x76, 0x60, 0x7, 0x7B, 0x68, - 0x7, 0x37, 0x60, 0x87, 0x77, 0x78, 0x7, 0x78, - 0x98, 0x51, 0x4C, 0xF4, 0x90, 0xF, 0xF0, 0x50, - 0xE, 0x33, 0x1E, 0x6A, 0x1E, 0xCA, 0x61, 0x1C, - 0xE8, 0x21, 0x1D, 0xDE, 0xC1, 0x1D, 0x7E, 0x1, - 0x1E, 0xE4, 0xA1, 0x1C, 0xCC, 0x21, 0x1D, 0xF0, - 0x61, 0x6, 0x54, 0x85, 0x83, 0x38, 0xCC, 0xC3, - 0x3B, 0xB0, 0x43, 0x3D, 0xD0, 0x43, 0x39, 0xFC, - 0xC2, 0x3C, 0xE4, 0x43, 0x3B, 0x88, 0xC3, 0x3B, - 0xB0, 0xC3, 0x8C, 0xC5, 0xA, 0x87, 0x79, 0x98, - 0x87, 0x77, 0x18, 0x87, 0x74, 0x8, 0x7, 0x7A, - 0x28, 0x7, 0x72, 0x98, 0x81, 0x5C, 0xE3, 0x10, - 0xE, 0xEC, 0xC0, 0xE, 0xE5, 0x50, 0xE, 0xF3, 0x30, - 0x23, 0xC1, 0xD2, 0x41, 0x1E, 0xE4, 0xE1, 0x17, - 0xD8, 0xE1, 0x1D, 0xDE, 0x1, 0x1E, 0x66, 0x48, - 0x19, 0x3B, 0xB0, 0x83, 0x3D, 0xB4, 0x83, 0x1B, - 0x84, 0xC3, 0x38, 0x8C, 0x43, 0x39, 0xCC, 0xC3, - 0x3C, 0xB8, 0xC1, 0x39, 0xC8, 0xC3, 0x3B, 0xD4, - 0x3, 0x3C, 0xCC, 0x48, 0xB4, 0x71, 0x8, 0x7, 0x76, - 0x60, 0x7, 0x71, 0x8, 0x87, 0x71, 0x58, 0x87, - 0x19, 
0xDB, 0xC6, 0xE, 0xEC, 0x60, 0xF, 0xED, - 0xE0, 0x6, 0xF0, 0x20, 0xF, 0xE5, 0x30, 0xF, 0xE5, - 0x20, 0xF, 0xF6, 0x50, 0xE, 0x6E, 0x10, 0xE, 0xE3, - 0x30, 0xE, 0xE5, 0x30, 0xF, 0xF3, 0xE0, 0x6, 0xE9, - 0xE0, 0xE, 0xE4, 0x50, 0xE, 0xF8, 0x30, 0x23, - 0xE2, 0xEC, 0x61, 0x1C, 0xC2, 0x81, 0x1D, 0xD8, - 0xE1, 0x17, 0xEC, 0x21, 0x1D, 0xE6, 0x21, 0x1D, - 0xC4, 0x21, 0x1D, 0xD8, 0x21, 0x1D, 0xE8, 0x21, - 0x1F, 0x66, 0x20, 0x9D, 0x3B, 0xBC, 0x43, 0x3D, - 0xB8, 0x3, 0x39, 0x94, 0x83, 0x39, 0xCC, 0x58, - 0xBC, 0x70, 0x70, 0x7, 0x77, 0x78, 0x7, 0x7A, - 0x8, 0x7, 0x7A, 0x48, 0x87, 0x77, 0x70, 0x87, - 0x19, 0xCB, 0xE7, 0xE, 0xEF, 0x30, 0xF, 0xE1, - 0xE0, 0xE, 0xE9, 0x40, 0xF, 0xE9, 0xA0, 0xF, 0xE5, - 0x30, 0xC3, 0x1, 0x3, 0x73, 0xA8, 0x7, 0x77, 0x18, - 0x87, 0x5F, 0x98, 0x87, 0x70, 0x70, 0x87, 0x74, - 0xA0, 0x87, 0x74, 0xD0, 0x87, 0x72, 0x98, 0x81, - 0x84, 0x41, 0x39, 0xE0, 0xC3, 0x38, 0xB0, 0x43, - 0x3D, 0x90, 0x43, 0x39, 0xCC, 0x40, 0xC4, 0xA0, - 0x1D, 0xCA, 0xA1, 0x1D, 0xE0, 0x41, 0x1E, 0xDE, - 0xC1, 0x1C, 0x66, 0x24, 0x63, 0x30, 0xE, 0xE1, - 0xC0, 0xE, 0xEC, 0x30, 0xF, 0xE9, 0x40, 0xF, 0xE5, - 0x30, 0x43, 0x21, 0x83, 0x75, 0x18, 0x7, 0x73, - 0x48, 0x87, 0x5F, 0xA0, 0x87, 0x7C, 0x80, 0x87, - 0x72, 0x98, 0xB1, 0x94, 0x1, 0x3C, 0x8C, 0xC3, - 0x3C, 0x94, 0xC3, 0x38, 0xD0, 0x43, 0x3A, 0xBC, - 0x83, 0x3B, 0xCC, 0xC3, 0x8C, 0xC5, 0xC, 0x48, - 0x21, 0x15, 0x42, 0x61, 0x1E, 0xE6, 0x21, 0x1D, - 0xCE, 0xC1, 0x1D, 0x52, 0x81, 0x14, 0x66, 0x4C, - 0x67, 0x30, 0xE, 0xEF, 0x20, 0xF, 0xEF, 0xE0, - 0x6, 0xEF, 0x50, 0xF, 0xF4, 0x30, 0xF, 0xE9, 0x40, - 0xE, 0xE5, 0xE0, 0x6, 0xE6, 0x20, 0xF, 0xE1, 0xD0, - 0xE, 0xE5, 0x30, 0xA3, 0x40, 0x83, 0x76, 0x68, - 0x7, 0x79, 0x8, 0x87, 0x19, 0x52, 0x1A, 0xB8, - 0xC3, 0x3B, 0x84, 0x3, 0x3B, 0xA4, 0x43, 0x38, - 0xCC, 0x83, 0x1B, 0x84, 0x3, 0x39, 0x90, 0x83, - 0x3C, 0xCC, 0x3, 0x3C, 0x84, 0xC3, 0x38, 0x94, - 0x3, 0x0, 0x0, 0x0, 0x0, 0x79, 0x28, 0x0, 0x0, - 0x2A, 0x0, 0x0, 0x0, 0xC2, 0x3C, 0x90, 0x40, 0x86, - 0x10, 0x19, 0x32, 0xE2, 0x64, 0x90, 0x40, 0x46, - 0x2, 0x19, 0x23, 0x23, 0x46, 0x2, 0x13, 0x24, - 0xC6, 0x0, 0x13, 0x74, 0x12, 0xA9, 0xB7, 0x37, - 0x3A, 0x23, 0xB6, 0xB0, 0xB3, 0xB9, 0x23, 0x8C, - 0xCD, 0x1D, 0xA2, 0x2D, 0x2C, 0xCD, 0x6D, 0x8, - 0x42, 0x1, 0xC, 0x41, 0x38, 0x82, 0x21, 0x8, 0x87, - 0x30, 0x4, 0xE1, 0x18, 0x86, 0x20, 0x1C, 0xC4, - 0x18, 0x84, 0xA0, 0x18, 0x43, 0x90, 0x8C, 0x41, - 0x20, 0x94, 0x31, 0xC, 0x82, 0x71, 0x8C, 0x41, - 0x28, 0x8E, 0x31, 0xC, 0x45, 0x51, 0x8C, 0x41, - 0x40, 0x9C, 0x31, 0x14, 0xC4, 0x0, 0x0, 0x8F, - 0x89, 0xC8, 0xF0, 0x5C, 0xE4, 0xDE, 0xDE, 0xE8, - 0xE6, 0xD2, 0xCE, 0xDC, 0xC2, 0xE8, 0xEA, 0xE4, - 0xCA, 0xE6, 0x86, 0x12, 0x28, 0xC6, 0x21, 0xC3, - 0x73, 0x99, 0x43, 0xB, 0x23, 0x2B, 0x93, 0x6B, - 0x7A, 0x23, 0x2B, 0x63, 0x1B, 0x4A, 0xB0, 0x18, - 0x85, 0xC, 0xCF, 0xC5, 0xAE, 0x4C, 0x6E, 0x2E, - 0xED, 0xCD, 0x6D, 0x28, 0x1, 0x63, 0x1C, 0x32, - 0x3C, 0x97, 0x32, 0x37, 0x3A, 0xB9, 0x3C, 0xA8, - 0xB7, 0x34, 0x37, 0xBA, 0xB9, 0xA1, 0x4, 0xF, - 0x0, 0x0, 0x71, 0x20, 0x0, 0x0, 0x2, 0x0, 0x0, - 0x0, 0x6, 0x40, 0x30, 0x0, 0xD2, 0x0, 0x0, 0x0, - 0x61, 0x20, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x13, - 0x4, 0x1, 0x86, 0x3, 0x1, 0x0, 0x0, 0x2, 0x0, - 0x0, 0x0, 0x7, 0x50, 0x10, 0xCD, 0x14, 0x61, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] - - Name: SFI0 - Size: 8 - - Name: HASH - Size: 20 - Hash: - IncludesSource: false - Digest: [ 0xCE, 0xA, 0x5B, 0x9C, 0xBF, 0x9A, 0xBB, 0x5, - 0x19, 0xC5, 0x96, 0x78, 0x49, 0x89, 0x5C, 0x6B ] - - Name: ISG1 - Size: 8 - Signature: - Parameters: [] - - Name: OSG1 - Size: 8 - Signature: - Parameters: [] - Name: RTS0 Size: 8 
RootSignature: Version: 1 - Flags: 8 - - Name: PSV0 - Size: 76 - PSVInfo: - Version: 3 - ShaderStage: 5 - MinimumWaveLaneCount: 0 - MaximumWaveLaneCount: 4294967295 - UsesViewID: 0 - SigInputVectors: 0 - SigOutputVectors: [ 0, 0, 0, 0 ] - NumThreadsX: 1 - NumThreadsY: 1 - NumThreadsZ: 1 - EntryName: main - ResourceStride: 24 - Resources: [] - SigInputElements: [] - SigOutputElements: [] - SigPatchOrPrimElements: [] - InputOutputMap: - - [ ] - - [ ] - - [ ] - - [ ] - -# CHECK: - Name: RTS0 -# CHECK-NEXT: Size: 8 -# CHECK-NEXT: RootSignature: -# CHECK-NEXT: Version: 1 -# CHECK-NEXT: Flags: 8 + AllowInputAssemblerInputLayout: true + DenyVertexShaderRootAccess: false + DenyHullShaderRootAccess: false + DenyDomainShaderRootAccess: false + DenyGeometryShaderRootAccess: false + DenyPixelShaderRootAccess: false + AllowStreamOutput: false + LocalRootSignature: false + DenyAmplificationShaderRootAccess: false + DenyMeshShaderRootAccess: false + CBVSRVUAVHeapDirectlyIndexed: false + SamplerHeapDirectlyIndexed: false +#CHECK: - Name: RTS0 +#CHECK-NEXT: Size: 8 +#CHECK-NEXT: RootSignature: +#CHECK-NEXT: Version: 1 +#CHECK-NEXT: AllowInputAssemblerInputLayout: true +#CHECK-NEXT: DenyVertexShaderRootAccess: false +#CHECK-NEXT: DenyHullShaderRootAccess: false +#CHECK-NEXT: DenyDomainShaderRootAccess: false +#CHECK-NEXT: DenyGeometryShaderRootAccess: false +#CHECK-NEXT: DenyPixelShaderRootAccess: false +#CHECK-NEXT: AllowStreamOutput: false +#CHECK-NEXT: LocalRootSignature: false +#CHECK-NEXT: DenyAmplificationShaderRootAccess: false +#CHECK-NEXT: DenyMeshShaderRootAccess: false +#CHECK-NEXT: CBVSRVUAVHeapDirectlyIndexed: false +#CHECK-NEXT: SamplerHeapDirectlyIndexed: false From 7d080b307cf2b5650a9771f2b97cd007ad4ecdac Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 14 Jan 2025 23:47:46 +0000 Subject: [PATCH 130/220] updating test --- .../ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml index 2ed71091cacd4..8ce18d8e1aa4c 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml @@ -25,6 +25,7 @@ Parts: DenyMeshShaderRootAccess: false CBVSRVUAVHeapDirectlyIndexed: false SamplerHeapDirectlyIndexed: false + #CHECK: - Name: RTS0 #CHECK-NEXT: Size: 8 #CHECK-NEXT: RootSignature: From 504527b033790ae2df94ec6978e9cf1ad8e6e56f Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 15 Jan 2025 00:42:39 +0000 Subject: [PATCH 131/220] removing useless header --- llvm/lib/Object/DXContainer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 1465128e04f86..4263322ef0aee 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -8,7 +8,6 @@ #include "llvm/Object/DXContainer.h" #include "llvm/BinaryFormat/DXContainer.h" -#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/FormatVariadic.h" From 242567225f435c604e59f305366f819169713a84 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 15 Jan 2025 17:30:00 +0000 Subject: [PATCH 132/220] fix formating --- llvm/include/llvm/BinaryFormat/DXContainer.h | 4 ++-- llvm/include/llvm/Object/DXContainer.h | 3 +-- llvm/include/llvm/ObjectYAML/DXContainerYAML.h | 1 - 
llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 2 +- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 15 +++++++-------- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index ad39d55e55057..addd2824da2d0 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -66,9 +66,9 @@ struct ShaderHash { struct RootSignatureDesc { uint32_t Version; uint32_t Flags; - void swapBytes() { + void swapBytes() { sys::swapByteOrder(Version); - sys::swapByteOrder(Flags); + sys::swapByteOrder(Flags); } }; diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index d7a397b608513..4ad4db7fe215f 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -406,8 +406,7 @@ class DXContainer { std::optional getShaderHash() const { return Hash; } - std::optional - getRootSignature() const { + std::optional getRootSignature() const { return RootSignature; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 9724c4eda746b..75eac9630168c 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -73,7 +73,6 @@ struct ShaderHash { std::vector Digest; }; - #define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; struct RootSignatureDesc { RootSignatureDesc() = default; diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index eb5dc95deede6..d97007699a0fd 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -268,7 +268,7 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { uint32_t Flags = P.RootSignature->getEncodedFlags(); if (sys::IsBigEndianHost) sys::swapByteOrder(Flags); - dxbc::RootSignatureDesc RS = {P.RootSignature->Version, Flags}; + dxbc::RootSignatureDesc RS = {P.RootSignature->Version, Flags}; if (sys::IsBigEndianHost) RS.swapBytes(); OS.write(reinterpret_cast(&RS), sizeof(dxbc::RootSignatureDesc)); diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index d226a5b2e5942..80f4587a06ff5 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -30,17 +30,17 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { #include "llvm/BinaryFormat/DXContainerConstants.def" } - -DXContainerYAML::RootSignatureDesc::RootSignatureDesc(const dxbc::RootSignatureDesc &Data): Version(Data.Version) { -#define ROOT_ELEMENT_FLAG(Num, Val) \ +DXContainerYAML::RootSignatureDesc::RootSignatureDesc( + const dxbc::RootSignatureDesc &Data) + : Version(Data.Version) { +#define ROOT_ELEMENT_FLAG(Num, Val) \ Val = (Data.Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } - uint32_t DXContainerYAML::RootSignatureDesc::getEncodedFlags() { uint64_t Flag = 0; -#define ROOT_ELEMENT_FLAG(Num, Val) \ +#define ROOT_ELEMENT_FLAG(Num, Val) \ if (Val) \ Flag |= (uint32_t)dxbc::RootElementFlag::Val; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -209,9 +209,8 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureDesc &S) { IO.mapRequired("Version", S.Version); - #define ROOT_ELEMENT_FLAG(Num, Val) \ - IO.mapRequired(#Val, S.Val); - #include "llvm/BinaryFormat/DXContainerConstants.def" 
+#define ROOT_ELEMENT_FLAG(Num, Val) IO.mapRequired(#Val, S.Val); +#include "llvm/BinaryFormat/DXContainerConstants.def" } void MappingTraits::mapping(IO &IO, From 08c17bbde586f7758381944892902970a0847528 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 15 Jan 2025 18:21:06 +0000 Subject: [PATCH 133/220] renaming test --- .../DXContainer/RootSignature-Flags.yaml | 49 ++++++++++++------- .../RootSignature-FlagsRootElement.yaml | 44 ----------------- 2 files changed, 30 insertions(+), 63 deletions(-) delete mode 100644 llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index b0a3e6945f454..8ce18d8e1aa4c 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -1,5 +1,4 @@ # RUN: yaml2obj %s | obj2yaml | FileCheck %s - --- !dxcontainer Header: Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @@ -11,23 +10,35 @@ Header: PartOffsets: [ 60 ] Parts: - Name: RTS0 - Size: 24 + Size: 8 RootSignature: - Version: 2 - NumParameters: 1 - RootParametersOffset: 3 - NumStaticSamplers: 4 - StaticSamplersOffset: 5 + Version: 1 AllowInputAssemblerInputLayout: true - DenyGeometryShaderRootAccess: true - -# CHECK: - Name: RTS0 -# CHECK-NEXT: Size: 24 -# CHECK-NEXT: RootSignature: -# CHECK-NEXT: Version: 2 -# CHECK-NEXT: NumParameters: 1 -# CHECK-NEXT: RootParametersOffset: 3 -# CHECK-NEXT: NumStaticSamplers: 4 -# CHECK-NEXT: StaticSamplersOffset: 5 -# CHECK-NEXT: AllowInputAssemblerInputLayout: true -# CHECK-NEXT: DenyGeometryShaderRootAccess: true + DenyVertexShaderRootAccess: false + DenyHullShaderRootAccess: false + DenyDomainShaderRootAccess: false + DenyGeometryShaderRootAccess: false + DenyPixelShaderRootAccess: false + AllowStreamOutput: false + LocalRootSignature: false + DenyAmplificationShaderRootAccess: false + DenyMeshShaderRootAccess: false + CBVSRVUAVHeapDirectlyIndexed: false + SamplerHeapDirectlyIndexed: false + +#CHECK: - Name: RTS0 +#CHECK-NEXT: Size: 8 +#CHECK-NEXT: RootSignature: +#CHECK-NEXT: Version: 1 +#CHECK-NEXT: AllowInputAssemblerInputLayout: true +#CHECK-NEXT: DenyVertexShaderRootAccess: false +#CHECK-NEXT: DenyHullShaderRootAccess: false +#CHECK-NEXT: DenyDomainShaderRootAccess: false +#CHECK-NEXT: DenyGeometryShaderRootAccess: false +#CHECK-NEXT: DenyPixelShaderRootAccess: false +#CHECK-NEXT: AllowStreamOutput: false +#CHECK-NEXT: LocalRootSignature: false +#CHECK-NEXT: DenyAmplificationShaderRootAccess: false +#CHECK-NEXT: DenyMeshShaderRootAccess: false +#CHECK-NEXT: CBVSRVUAVHeapDirectlyIndexed: false +#CHECK-NEXT: SamplerHeapDirectlyIndexed: false diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml deleted file mode 100644 index 8ce18d8e1aa4c..0000000000000 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-FlagsRootElement.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# RUN: yaml2obj %s | obj2yaml | FileCheck %s ---- !dxcontainer -Header: - Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] - Version: - Major: 1 - Minor: 0 - PartCount: 1 - PartOffsets: [ 60 ] -Parts: - - Name: RTS0 - Size: 8 - RootSignature: - Version: 1 - AllowInputAssemblerInputLayout: true - DenyVertexShaderRootAccess: false - DenyHullShaderRootAccess: false - DenyDomainShaderRootAccess: false - DenyGeometryShaderRootAccess: false - 
DenyPixelShaderRootAccess: false - AllowStreamOutput: false - LocalRootSignature: false - DenyAmplificationShaderRootAccess: false - DenyMeshShaderRootAccess: false - CBVSRVUAVHeapDirectlyIndexed: false - SamplerHeapDirectlyIndexed: false - -#CHECK: - Name: RTS0 -#CHECK-NEXT: Size: 8 -#CHECK-NEXT: RootSignature: -#CHECK-NEXT: Version: 1 -#CHECK-NEXT: AllowInputAssemblerInputLayout: true -#CHECK-NEXT: DenyVertexShaderRootAccess: false -#CHECK-NEXT: DenyHullShaderRootAccess: false -#CHECK-NEXT: DenyDomainShaderRootAccess: false -#CHECK-NEXT: DenyGeometryShaderRootAccess: false -#CHECK-NEXT: DenyPixelShaderRootAccess: false -#CHECK-NEXT: AllowStreamOutput: false -#CHECK-NEXT: LocalRootSignature: false -#CHECK-NEXT: DenyAmplificationShaderRootAccess: false -#CHECK-NEXT: DenyMeshShaderRootAccess: false -#CHECK-NEXT: CBVSRVUAVHeapDirectlyIndexed: false -#CHECK-NEXT: SamplerHeapDirectlyIndexed: false From 395a5367e10af0bc2a7b78997e6641e0e8ca70d1 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 16 Jan 2025 22:16:45 +0000 Subject: [PATCH 134/220] addressing pr comments --- llvm/include/llvm/BinaryFormat/DXContainer.h | 2 +- .../BinaryFormat/DXContainerConstants.def | 24 +++++++++---------- .../include/llvm/ObjectYAML/DXContainerYAML.h | 2 +- llvm/lib/Object/DXContainer.cpp | 1 - llvm/lib/ObjectYAML/DXContainerYAML.cpp | 6 ++--- llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 3 +-- 6 files changed, 18 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index addd2824da2d0..a088cc158a6c8 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -161,7 +161,7 @@ enum class FeatureFlags : uint64_t { static_assert((uint64_t)FeatureFlags::NextUnusedBit <= 1ull << 63, "Shader flag bits exceed enum size."); -#define ROOT_ELEMENT_FLAG(Num, Val) Val = 1ull << Num, +#define ROOT_ELEMENT_FLAG(Num, Val, Str) Val = 1ull << Num, enum class RootElementFlag : uint32_t { #include "DXContainerConstants.def" }; diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 2a3f4b4fc5bd6..e5e92c1af2476 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -56,18 +56,18 @@ SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a #ifdef ROOT_ELEMENT_FLAG -ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout) -ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess) -ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess) -ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess) -ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess) -ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess) -ROOT_ELEMENT_FLAG(6, AllowStreamOutput) -ROOT_ELEMENT_FLAG(7, LocalRootSignature) -ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess) -ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess) -ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed) -ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) +ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout, "") +ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess, "") +ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess, "") +ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess, "") +ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess, "") +ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess, "") +ROOT_ELEMENT_FLAG(6, AllowStreamOutput, "") +ROOT_ELEMENT_FLAG(7, LocalRootSignature, "") +ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess, "") 
+ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess, "") +ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed, "") +ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed, "") #undef ROOT_ELEMENT_FLAG #endif // ROOT_ELEMENT_FLAG diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 75eac9630168c..77a7724c64a00 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -73,7 +73,7 @@ struct ShaderHash { std::vector Digest; }; -#define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; +#define ROOT_ELEMENT_FLAG(Num, Val, Str) bool Val = false; struct RootSignatureDesc { RootSignatureDesc() = default; RootSignatureDesc(const dxbc::RootSignatureDesc &Data); diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 4263322ef0aee..432dc8386a535 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -210,7 +210,6 @@ Error DXContainer::parsePartOffsets() { case dxbc::PartType::RTS0: if (Error Err = parseRootSignature(PartData)) return Err; - break; } } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 80f4587a06ff5..682216e5febec 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -33,14 +33,14 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { DXContainerYAML::RootSignatureDesc::RootSignatureDesc( const dxbc::RootSignatureDesc &Data) : Version(Data.Version) { -#define ROOT_ELEMENT_FLAG(Num, Val) \ +#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ Val = (Data.Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } uint32_t DXContainerYAML::RootSignatureDesc::getEncodedFlags() { uint64_t Flag = 0; -#define ROOT_ELEMENT_FLAG(Num, Val) \ +#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ if (Val) \ Flag |= (uint32_t)dxbc::RootElementFlag::Val; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -209,7 +209,7 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureDesc &S) { IO.mapRequired("Version", S.Version); -#define ROOT_ELEMENT_FLAG(Num, Val) IO.mapRequired(#Val, S.Val); +#define ROOT_ELEMENT_FLAG(Num, Val, Str) IO.mapRequired(#Val, S.Val); #include "llvm/BinaryFormat/DXContainerConstants.def" } diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index 6ae0a0859b48e..9588a8277dad7 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -155,10 +155,9 @@ dumpDXContainer(MemoryBufferRef Source) { break; case dxbc::PartType::RTS0: std::optional RS = Container.getRootSignature(); - if (RS && RS.has_value()) + if (RS.has_value()) NewPart.RootSignature = DXContainerYAML::RootSignatureDesc(*RS); break; - break; } } From f93dff923fc74e69cf534c6f370141b5cef662d6 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 16 Jan 2025 22:20:18 +0000 Subject: [PATCH 135/220] adding str to ROOT_ELEMENT_FLAG --- .../BinaryFormat/DXContainerConstants.def | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index e5e92c1af2476..fd35471157d0c 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -56,18 +56,18 @@ 
SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a #ifdef ROOT_ELEMENT_FLAG -ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout, "") -ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess, "") -ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess, "") -ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess, "") -ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess, "") -ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess, "") -ROOT_ELEMENT_FLAG(6, AllowStreamOutput, "") -ROOT_ELEMENT_FLAG(7, LocalRootSignature, "") -ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess, "") -ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess, "") -ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed, "") -ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed, "") +ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout, "The app is opting in to using the Input Assembler") +ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess, "Denies the vertex shader access to the root signature.") +ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess, "Denies the hull shader access to the root signature.") +ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess, "Denies the domain shader access to the root signature.") +ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess, "Denies the geometry shader access to the root signature.") +ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess, "Denies the pixel shader access to the root signature.") +ROOT_ELEMENT_FLAG(6, AllowStreamOutput, "The app is opting in to using Stream Output.") +ROOT_ELEMENT_FLAG(7, LocalRootSignature, "The root signature is to be used with raytracing shaders to define resource bindings sourced from shader records in shader tables.") +ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess, "Denies the amplification shader access to the root signature.") +ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess, "Denies the mesh shader access to the root signature.") +ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed, "The shaders are allowed to index the CBV/SRV/UAV descriptor heap directly, using the ResourceDescriptorHeap built-in variable.") +ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed, "The shaders are allowed to index the sampler descriptor heap directly, using the SamplerDescriptorHeap built-in variable.") #undef ROOT_ELEMENT_FLAG #endif // ROOT_ELEMENT_FLAG From 1c1edb8e26a0809f5fe4a69f1ba77aebd4c43c6c Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 17 Jan 2025 07:28:09 +0000 Subject: [PATCH 136/220] formating --- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 682216e5febec..0351239cac2c1 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -33,14 +33,14 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { DXContainerYAML::RootSignatureDesc::RootSignatureDesc( const dxbc::RootSignatureDesc &Data) : Version(Data.Version) { -#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ +#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ Val = (Data.Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } uint32_t DXContainerYAML::RootSignatureDesc::getEncodedFlags() { uint64_t Flag = 0; -#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ +#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ if (Val) \ Flag |= (uint32_t)dxbc::RootElementFlag::Val; #include "llvm/BinaryFormat/DXContainerConstants.def" From 5bef775af1ff725651b5d796cecf69befaa09cf9 Mon Sep 17 
00:00:00 2001 From: joaosaffran Date: Sat, 18 Jan 2025 00:03:15 +0000 Subject: [PATCH 137/220] refactoring to follow llvm standards --- .../llvm/MC/DXContainerRootSignature.h | 17 +++---- llvm/include/llvm/Object/DXContainer.h | 21 ++++---- .../include/llvm/ObjectYAML/DXContainerYAML.h | 2 +- llvm/lib/MC/DXContainerRootSignature.cpp | 22 +++++--- llvm/lib/Object/DXContainer.cpp | 51 ++++--------------- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 13 +++-- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 8 +-- llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 2 +- 8 files changed, 54 insertions(+), 82 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index e414112498798..3926193697a49 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -1,4 +1,5 @@ -//===- llvm/MC/DXContainerRootSignature.h - RootSignature -*- C++ -*- ========// +//===- llvm/MC/DXContainerRootSignature.h - DXContainer RootSignature -*- C++ +//-------*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -14,15 +15,13 @@ namespace llvm { class raw_ostream; namespace mcdxbc { -struct RootSignatureDesc { - uint32_t Version = 2; - uint32_t NumParameters = 0; - uint32_t RootParametersOffset = 0; - uint32_t NumStaticSamplers = 0; - uint32_t StaticSamplersOffset = 0; - uint32_t Flags = 0; +struct RootSignatureHeader { + uint32_t Version; + uint32_t Flags; - void write(raw_ostream &OS) const; + void swapBytes(); + void write(raw_ostream &OS, + uint32_t Version = std::numeric_limits::max()); }; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 4ad4db7fe215f..0e93a0f1d9615 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -22,6 +22,8 @@ #include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" #include +#include +#include #include namespace llvm { @@ -119,22 +121,17 @@ namespace DirectX { class RootSignature { private: + StringRef Data; uint32_t Version; - uint32_t NumParameters; - uint32_t RootParametersOffset; - uint32_t NumStaticSamplers; - uint32_t StaticSamplersOffset; uint32_t Flags; public: - RootSignature() {} + RootSignature(StringRef Data) : Data(Data) {} + + Error parse(); - Error parse(StringRef Data); uint32_t getVersion() const { return Version; } - uint32_t getNumParameters() const { return NumParameters; } - uint32_t getRootParametersOffset() const { return RootParametersOffset; } - uint32_t getNumStaticSamplers() const { return NumStaticSamplers; } - uint32_t getStaticSamplersOffset() const { return StaticSamplersOffset; } + uint32_t getFlags() const { return Flags; } }; @@ -309,7 +306,7 @@ class DXContainer { std::optional ShaderFeatureFlags; std::optional Hash; std::optional PSVInfo; - std::optional RootSignature; + std::optional RootSignature; DirectX::Signature InputSignature; DirectX::Signature OutputSignature; DirectX::Signature PatchConstantSignature; @@ -406,7 +403,7 @@ class DXContainer { std::optional getShaderHash() const { return Hash; } - std::optional getRootSignature() const { + std::optional getRootSignature() const { return RootSignature; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 77a7724c64a00..eb514c1976759 100644 --- 
a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -76,7 +76,7 @@ struct ShaderHash { #define ROOT_ELEMENT_FLAG(Num, Val, Str) bool Val = false; struct RootSignatureDesc { RootSignatureDesc() = default; - RootSignatureDesc(const dxbc::RootSignatureDesc &Data); + RootSignatureDesc(const object::DirectX::RootSignature &Data); uint32_t getEncodedFlags(); uint32_t Version; diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index b6f2b85bac74e..331d5131fce7b 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -1,4 +1,5 @@ -//===- llvm/MC/DXContainerRootSignature.cpp - RootSignature -*- C++ -*-=======// +//===- llvm/MC/DXContainerRootSignature.cpp - DXContainer RootSignature -*- C++ +//-------*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,16 +9,21 @@ #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/SwapByteOrder.h" +#include using namespace llvm; using namespace llvm::mcdxbc; -void RootSignatureDesc::write(raw_ostream &OS) const { +void RootSignatureHeader::write(raw_ostream &OS, uint32_t Version) { - support::endian::write(OS, Version, llvm::endianness::little); - support::endian::write(OS, NumParameters, llvm::endianness::little); - support::endian::write(OS, RootParametersOffset, llvm::endianness::little); - support::endian::write(OS, NumStaticSamplers, llvm::endianness::little); - support::endian::write(OS, StaticSamplersOffset, llvm::endianness::little); - support::endian::write(OS, Flags, llvm::endianness::little); + uint32_t SizeInfo = sizeof(this); + // support::endian::write(OS, SizeInfo, llvm::endianness::little); + + if (sys::IsBigEndianHost) { + sys::swapByteOrder(Version); + sys::swapByteOrder(Flags); + } + + OS.write(reinterpret_cast(this), SizeInfo); } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 432dc8386a535..28fc0d1e11add 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -99,8 +99,8 @@ Error DXContainer::parseHash(StringRef Part) { Error DXContainer::parseRootSignature(StringRef Part) { if (RootSignature) return parseFailed("More than one RTS0 part is present in the file"); - dxbc::RootSignatureDesc Desc; - if (Error Err = readStruct(Part, Part.begin(), Desc)) + DirectX::RootSignature Desc(Part); + if (Error Err = Desc.parse()) return Err; RootSignature = Desc; return Error::success(); @@ -246,48 +246,17 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { IteratorState.Offset = Offset; } -Error DirectX::RootSignature::parse(StringRef Data) { +Error DirectX::RootSignature::parse() { const char *Current = Data.begin(); + dxbc::RootSignatureDesc Desc; + if (Error Err = readStruct(Data, Current, Desc)) + return Err; - // Root Signature headers expects 6 integers to be present. 
- if (Data.size() < 6 * sizeof(uint32_t)) - return parseFailed( - "Invalid root signature, insufficient space for header."); - - uint32_t VValue = - support::endian::read(Current); - Current += sizeof(uint32_t); - - if (!dxbc::RootSignatureValidations::isValidVersion(VValue)) - return validationFailed("unsupported root signature version read: " + - llvm::Twine(VValue)); - Version = VValue; - - NumParameters = - support::endian::read(Current); - Current += sizeof(uint32_t); - - RootParametersOffset = - support::endian::read(Current); - Current += sizeof(uint32_t); - - NumStaticSamplers = - support::endian::read(Current); - Current += sizeof(uint32_t); - - StaticSamplersOffset = - support::endian::read(Current); - Current += sizeof(uint32_t); - - uint32_t FValue = - support::endian::read(Current); - Current += sizeof(uint32_t); - - if (!dxbc::RootSignatureValidations::isValidRootFlag(FValue)) - return validationFailed("unsupported root signature flag value read: " + - llvm::Twine(FValue)); - Flags = FValue; + if (sys::IsBigEndianHost) + Desc.swapBytes(); + Version = Desc.Version; + Flags = Desc.Flags; return Error::success(); } diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index d97007699a0fd..0504f6b88a7db 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -265,13 +265,12 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { case dxbc::PartType::RTS0: if (!P.RootSignature.has_value()) continue; - uint32_t Flags = P.RootSignature->getEncodedFlags(); - if (sys::IsBigEndianHost) - sys::swapByteOrder(Flags); - dxbc::RootSignatureDesc RS = {P.RootSignature->Version, Flags}; - if (sys::IsBigEndianHost) - RS.swapBytes(); - OS.write(reinterpret_cast(&RS), sizeof(dxbc::RootSignatureDesc)); + + mcdxbc::RootSignatureHeader Header; + Header.Version = P.RootSignature->Version; + Header.Flags = P.RootSignature->getEncodedFlags(); + + Header.write(OS); break; } uint64_t BytesWritten = OS.tell() - DataStart; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 0351239cac2c1..aeae3d9f3958a 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -14,6 +14,7 @@ #include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/DXContainer.h" #include "llvm/Support/ScopedPrinter.h" #include @@ -31,10 +32,11 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { } DXContainerYAML::RootSignatureDesc::RootSignatureDesc( - const dxbc::RootSignatureDesc &Data) - : Version(Data.Version) { + const object::DirectX::RootSignature &Data) + : Version(Data.getVersion()) { + uint32_t Flags = Data.getFlags(); #define ROOT_ELEMENT_FLAG(Num, Val, Str) \ - Val = (Data.Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; + Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index 9588a8277dad7..54a912d9438af 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -154,7 +154,7 @@ dumpDXContainer(MemoryBufferRef Source) { case dxbc::PartType::Unknown: break; case dxbc::PartType::RTS0: - std::optional RS = Container.getRootSignature(); + std::optional RS = Container.getRootSignature(); if (RS.has_value()) 
NewPart.RootSignature = DXContainerYAML::RootSignatureDesc(*RS); break; From 628545a18add2564d40ac8330796a3adbf359e2a Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 27 Jan 2025 21:13:51 +0000 Subject: [PATCH 138/220] addressing comments --- llvm/include/llvm/BinaryFormat/DXContainer.h | 31 +++++++++++++++++++ .../BinaryFormat/DXContainerConstants.def | 25 ++++++++++++++- .../llvm/MC/DXContainerRootSignature.h | 3 +- llvm/include/llvm/Object/DXContainer.h | 3 ++ .../include/llvm/ObjectYAML/DXContainerYAML.h | 5 +++ llvm/lib/MC/DXContainerRootSignature.cpp | 16 +++------- llvm/lib/Object/DXContainer.cpp | 21 +++++++------ llvm/lib/ObjectYAML/DXContainerYAML.cpp | 5 +-- .../DXContainer/RootSignature-Flags.yaml | 26 ++-------------- 9 files changed, 86 insertions(+), 49 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index a088cc158a6c8..ad8fc080dd345 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -13,10 +13,12 @@ #ifndef LLVM_BINARYFORMAT_DXCONTAINER_H #define LLVM_BINARYFORMAT_DXCONTAINER_H +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" +#include #include namespace llvm { @@ -63,10 +65,39 @@ struct ShaderHash { void swapBytes() { sys::swapByteOrder(Flags); } }; +#define ROOT_PARAMETER(RootParameter) RootParameter, +enum class RootParameterType { +#include "DXContainerConstants.def" +}; + +#define SHADER_VISIBILITY(ShaderVisibility) ShaderVisibility, +enum class ShaderVisibilityFlag { +#include "DXContainerConstants.def" +}; + +struct RootConstants { + uint32_t ShaderRegister; + uint32_t RegisterSpace; + uint32_t Num32BitValues; +}; + +struct RootParameter { + RootParameterType ParameterType; + union { + RootConstants Constants; + }; + ShaderVisibilityFlag ShaderVisibility; +}; + struct RootSignatureDesc { + uint32_t Size; uint32_t Version; uint32_t Flags; + uint32_t NumParameters; + RootParameter *Parameters; + void swapBytes() { + sys::swapByteOrder(Size); sys::swapByteOrder(Version); sys::swapByteOrder(Flags); } diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index fd35471157d0c..96d84fdc3faa3 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -53,8 +53,31 @@ SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a #undef SHADER_FEATURE_FLAG #endif // SHADER_FEATURE_FLAG -#ifdef ROOT_ELEMENT_FLAG +#ifdef ROOT_PARAMETER + +ROOT_PARAMETER(DescriptorTable) +ROOT_PARAMETER(Constants32Bit) +ROOT_PARAMETER(CBV) +ROOT_PARAMETER(SRV) +ROOT_PARAMETER(UAV) +#undef ROOT_PARAMETER +#endif // ROOT_PARAMETER + + +#ifdef SHADER_VISIBILITY + +SHADER_VISIBILITY(All) +SHADER_VISIBILITY(Vertex) +SHADER_VISIBILITY(Hull) +SHADER_VISIBILITY(Domain) +SHADER_VISIBILITY(Geometry) +SHADER_VISIBILITY(Pixel) +SHADER_VISIBILITY(Amplification) +SHADER_VISIBILITY(Mesh) +#undef SHADER_VISIBILITY +#endif // SHADER_VISIBILITY +#ifdef ROOT_ELEMENT_FLAG ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout, "The app is opting in to using the Input Assembler") ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess, "Denies the vertex shader access to the root signature.") diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 3926193697a49..d83ae28ffd692 
100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -20,8 +20,7 @@ struct RootSignatureHeader { uint32_t Flags; void swapBytes(); - void write(raw_ostream &OS, - uint32_t Version = std::numeric_limits::max()); + void write(raw_ostream &OS); }; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 0e93a0f1d9615..07a3c872ac83d 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -122,6 +122,7 @@ namespace DirectX { class RootSignature { private: StringRef Data; + uint32_t Size; uint32_t Version; uint32_t Flags; @@ -130,6 +131,8 @@ class RootSignature { Error parse(); + uint32_t getSize() const { return Size; } + uint32_t getVersion() const { return Version; } uint32_t getFlags() const { return Flags; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index eb514c1976759..e9b318faee295 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -15,6 +15,7 @@ #ifndef LLVM_OBJECTYAML_DXCONTAINERYAML_H #define LLVM_OBJECTYAML_DXCONTAINERYAML_H +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/DXContainer.h" @@ -79,7 +80,11 @@ struct RootSignatureDesc { RootSignatureDesc(const object::DirectX::RootSignature &Data); uint32_t getEncodedFlags(); + uint32_t Size; uint32_t Version; + uint32_t NumParameters; + SmallVector Parameters; + #include "llvm/BinaryFormat/DXContainerConstants.def" }; diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 331d5131fce7b..0bb87c2cc3832 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -1,5 +1,4 @@ -//===- llvm/MC/DXContainerRootSignature.cpp - DXContainer RootSignature -*- C++ -//-------*-===// +//===- llvm/MC/DXContainerRootSignature.cpp - RootSignature -*- C++ -*-=======// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -15,15 +14,10 @@ using namespace llvm; using namespace llvm::mcdxbc; -void RootSignatureHeader::write(raw_ostream &OS, uint32_t Version) { +void RootSignatureHeader::write(raw_ostream &OS) { uint32_t SizeInfo = sizeof(this); - // support::endian::write(OS, SizeInfo, llvm::endianness::little); - - if (sys::IsBigEndianHost) { - sys::swapByteOrder(Version); - sys::swapByteOrder(Flags); - } - - OS.write(reinterpret_cast(this), SizeInfo); + support::endian::write(OS, SizeInfo, llvm::endianness::little); + support::endian::write(OS, Version, llvm::endianness::little); + support::endian::write(OS, Flags, llvm::endianness::little); } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 28fc0d1e11add..715bc7b0a2678 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -10,6 +10,7 @@ #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; @@ -99,10 +100,9 @@ Error DXContainer::parseHash(StringRef Part) { Error DXContainer::parseRootSignature(StringRef Part) { if (RootSignature) return parseFailed("More than one RTS0 part is present in the file"); - DirectX::RootSignature Desc(Part); - if (Error Err = Desc.parse()) + RootSignature = DirectX::RootSignature(Part); + if (Error Err = RootSignature->parse()) return Err; - RootSignature = Desc; return Error::success(); } @@ -248,15 +248,16 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { Error DirectX::RootSignature::parse() { const char *Current = Data.begin(); - dxbc::RootSignatureDesc Desc; - if (Error Err = readStruct(Data, Current, Desc)) - return Err; - if (sys::IsBigEndianHost) - Desc.swapBytes(); + Size = support::endian::read(Current); + Current += sizeof(uint32_t); + + Version = support::endian::read(Current); + Current += sizeof(uint32_t); + + Flags = support::endian::read(Current); + Current += sizeof(uint32_t); - Version = Desc.Version; - Flags = Desc.Flags; return Error::success(); } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index aeae3d9f3958a..f3febcb09400f 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -33,7 +33,7 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { DXContainerYAML::RootSignatureDesc::RootSignatureDesc( const object::DirectX::RootSignature &Data) - : Version(Data.getVersion()) { + : Size(Data.getSize()), Version(Data.getVersion()) { uint32_t Flags = Data.getFlags(); #define ROOT_ELEMENT_FLAG(Num, Val, Str) \ Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; @@ -210,8 +210,9 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureDesc &S) { + IO.mapRequired("Size", S.Size); IO.mapRequired("Version", S.Version); -#define ROOT_ELEMENT_FLAG(Num, Val, Str) IO.mapRequired(#Val, S.Val); +#define ROOT_ELEMENT_FLAG(Num, Val, Str) IO.mapOptional(#Val, S.Val, false); #include "llvm/BinaryFormat/DXContainerConstants.def" } diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index 8ce18d8e1aa4c..6c0ccda2e4ca5 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -12,33 +12,13 @@ Parts: - Name: RTS0 Size: 8 RootSignature: + Size: 8 Version: 1 
AllowInputAssemblerInputLayout: true - DenyVertexShaderRootAccess: false - DenyHullShaderRootAccess: false - DenyDomainShaderRootAccess: false - DenyGeometryShaderRootAccess: false - DenyPixelShaderRootAccess: false - AllowStreamOutput: false - LocalRootSignature: false - DenyAmplificationShaderRootAccess: false - DenyMeshShaderRootAccess: false - CBVSRVUAVHeapDirectlyIndexed: false - SamplerHeapDirectlyIndexed: false - + #CHECK: - Name: RTS0 #CHECK-NEXT: Size: 8 #CHECK-NEXT: RootSignature: +#CHECK-NEXT: Size: 8 #CHECK-NEXT: Version: 1 #CHECK-NEXT: AllowInputAssemblerInputLayout: true -#CHECK-NEXT: DenyVertexShaderRootAccess: false -#CHECK-NEXT: DenyHullShaderRootAccess: false -#CHECK-NEXT: DenyDomainShaderRootAccess: false -#CHECK-NEXT: DenyGeometryShaderRootAccess: false -#CHECK-NEXT: DenyPixelShaderRootAccess: false -#CHECK-NEXT: AllowStreamOutput: false -#CHECK-NEXT: LocalRootSignature: false -#CHECK-NEXT: DenyAmplificationShaderRootAccess: false -#CHECK-NEXT: DenyMeshShaderRootAccess: false -#CHECK-NEXT: CBVSRVUAVHeapDirectlyIndexed: false -#CHECK-NEXT: SamplerHeapDirectlyIndexed: false From c5c2ab6d1eb86cd614a847a10cb6f352e937e5c9 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 27 Jan 2025 22:04:58 +0000 Subject: [PATCH 139/220] clean up --- llvm/include/llvm/BinaryFormat/DXContainer.h | 2 -- llvm/include/llvm/MC/DXContainerRootSignature.h | 3 +-- llvm/include/llvm/Object/DXContainer.h | 2 -- llvm/include/llvm/ObjectYAML/DXContainerYAML.h | 1 - llvm/lib/ObjectYAML/DXContainerYAML.cpp | 1 - 5 files changed, 1 insertion(+), 8 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index ad8fc080dd345..942533d95ebea 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -13,12 +13,10 @@ #ifndef LLVM_BINARYFORMAT_DXCONTAINER_H #define LLVM_BINARYFORMAT_DXCONTAINER_H -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" -#include #include namespace llvm { diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index d83ae28ffd692..23de2709088c6 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -1,5 +1,4 @@ -//===- llvm/MC/DXContainerRootSignature.h - DXContainer RootSignature -*- C++ -//-------*-===// +//===- llvm/MC/DXContainerRootSignature.h - RootSignature -*- C++ -*- ========// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 07a3c872ac83d..290fbd6999186 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -22,8 +22,6 @@ #include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" #include -#include -#include #include namespace llvm { diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index e9b318faee295..6b01f105a544b 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -15,7 +15,6 @@ #ifndef LLVM_OBJECTYAML_DXCONTAINERYAML_H #define LLVM_OBJECTYAML_DXCONTAINERYAML_H -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/DXContainer.h" diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index f3febcb09400f..985546872a8b3 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -16,7 +16,6 @@ #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/DXContainer.h" #include "llvm/Support/ScopedPrinter.h" -#include namespace llvm { From b9db72cb3ef75ff2d5637ff814030290b87c4e80 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 30 Jan 2025 22:43:23 +0000 Subject: [PATCH 140/220] remove version --- llvm/include/llvm/MC/DXContainerRootSignature.h | 1 - llvm/include/llvm/Object/DXContainer.h | 3 --- llvm/include/llvm/ObjectYAML/DXContainerYAML.h | 1 - llvm/lib/MC/DXContainerRootSignature.cpp | 1 - llvm/lib/Object/DXContainer.cpp | 3 --- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 1 - llvm/lib/ObjectYAML/DXContainerYAML.cpp | 3 +-- llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml | 2 -- 8 files changed, 1 insertion(+), 14 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 23de2709088c6..20b4f5a4285f6 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -15,7 +15,6 @@ class raw_ostream; namespace mcdxbc { struct RootSignatureHeader { - uint32_t Version; uint32_t Flags; void swapBytes(); diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 290fbd6999186..5f7737d2fa41d 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -121,7 +121,6 @@ class RootSignature { private: StringRef Data; uint32_t Size; - uint32_t Version; uint32_t Flags; public: @@ -131,8 +130,6 @@ class RootSignature { uint32_t getSize() const { return Size; } - uint32_t getVersion() const { return Version; } - uint32_t getFlags() const { return Flags; } }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 6b01f105a544b..9b3259f3bf6c6 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -80,7 +80,6 @@ struct RootSignatureDesc { uint32_t getEncodedFlags(); uint32_t Size; - uint32_t Version; uint32_t NumParameters; SmallVector Parameters; diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 0bb87c2cc3832..4e085654a1e5e 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -18,6 +18,5 @@ void RootSignatureHeader::write(raw_ostream &OS) { 
uint32_t SizeInfo = sizeof(this); support::endian::write(OS, SizeInfo, llvm::endianness::little); - support::endian::write(OS, Version, llvm::endianness::little); support::endian::write(OS, Flags, llvm::endianness::little); } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 715bc7b0a2678..a6b5346601a3d 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -252,9 +252,6 @@ Error DirectX::RootSignature::parse() { Size = support::endian::read(Current); Current += sizeof(uint32_t); - Version = support::endian::read(Current); - Current += sizeof(uint32_t); - Flags = support::endian::read(Current); Current += sizeof(uint32_t); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 0504f6b88a7db..ada7383ea3c6b 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -267,7 +267,6 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { continue; mcdxbc::RootSignatureHeader Header; - Header.Version = P.RootSignature->Version; Header.Flags = P.RootSignature->getEncodedFlags(); Header.write(OS); diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 985546872a8b3..fd85d75dc32eb 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -32,7 +32,7 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { DXContainerYAML::RootSignatureDesc::RootSignatureDesc( const object::DirectX::RootSignature &Data) - : Size(Data.getSize()), Version(Data.getVersion()) { + : Size(Data.getSize()) { uint32_t Flags = Data.getFlags(); #define ROOT_ELEMENT_FLAG(Num, Val, Str) \ Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; @@ -210,7 +210,6 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureDesc &S) { IO.mapRequired("Size", S.Size); - IO.mapRequired("Version", S.Version); #define ROOT_ELEMENT_FLAG(Num, Val, Str) IO.mapOptional(#Val, S.Val, false); #include "llvm/BinaryFormat/DXContainerConstants.def" } diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index 6c0ccda2e4ca5..6f10bd2f74b46 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -13,12 +13,10 @@ Parts: Size: 8 RootSignature: Size: 8 - Version: 1 AllowInputAssemblerInputLayout: true #CHECK: - Name: RTS0 #CHECK-NEXT: Size: 8 #CHECK-NEXT: RootSignature: #CHECK-NEXT: Size: 8 -#CHECK-NEXT: Version: 1 #CHECK-NEXT: AllowInputAssemblerInputLayout: true From f4af04372b1a13c36460350fcaa24dcece8d7e23 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 30 Jan 2025 22:53:12 +0000 Subject: [PATCH 141/220] fix pr --- llvm/include/llvm/BinaryFormat/DXContainer.h | 28 ------------------- .../include/llvm/ObjectYAML/DXContainerYAML.h | 2 -- 2 files changed, 30 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 942533d95ebea..3f7b50b82c7c8 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -63,40 +63,12 @@ struct ShaderHash { void swapBytes() { sys::swapByteOrder(Flags); } }; -#define ROOT_PARAMETER(RootParameter) RootParameter, -enum class RootParameterType { -#include "DXContainerConstants.def" -}; - -#define SHADER_VISIBILITY(ShaderVisibility) 
ShaderVisibility, -enum class ShaderVisibilityFlag { -#include "DXContainerConstants.def" -}; - -struct RootConstants { - uint32_t ShaderRegister; - uint32_t RegisterSpace; - uint32_t Num32BitValues; -}; - -struct RootParameter { - RootParameterType ParameterType; - union { - RootConstants Constants; - }; - ShaderVisibilityFlag ShaderVisibility; -}; - struct RootSignatureDesc { uint32_t Size; - uint32_t Version; uint32_t Flags; - uint32_t NumParameters; - RootParameter *Parameters; void swapBytes() { sys::swapByteOrder(Size); - sys::swapByteOrder(Version); sys::swapByteOrder(Flags); } }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 9b3259f3bf6c6..a82083fa18de6 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -80,8 +80,6 @@ struct RootSignatureDesc { uint32_t getEncodedFlags(); uint32_t Size; - uint32_t NumParameters; - SmallVector Parameters; #include "llvm/BinaryFormat/DXContainerConstants.def" }; From 496b922e02f967a49e2dc1da510d8ce863bdb220 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 31 Jan 2025 07:04:08 +0000 Subject: [PATCH 142/220] adding dxil-dis test --- .../DXContainer/RootSignature-Flags.yaml | 194 +++++++++++++++++- 1 file changed, 191 insertions(+), 3 deletions(-) diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index 6f10bd2f74b46..bcb04c2c9edd9 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -1,4 +1,6 @@ # RUN: yaml2obj %s | obj2yaml | FileCheck %s +# RUN: yaml2obj %s | dxil-dis | FileCheck %s --check-prefix=DXC + --- !dxcontainer Header: Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @@ -6,17 +8,203 @@ Header: Version: Major: 1 Minor: 0 - PartCount: 1 - PartOffsets: [ 60 ] + PartCount: 2 + PartOffsets: [ 60, 1496 ] Parts: + - Name: DXIL + Size: 1428 + Program: + MajorVersion: 6 + MinorVersion: 0 + ShaderKind: 5 + Size: 357 + DXILMajorVersion: 1 + DXILMinorVersion: 0 + DXILSize: 1404 + DXIL: [ 0x42, 0x43, 0xC0, 0xDE, 0x21, 0xC, 0x0, 0x0, 0x5C, + 0x1, 0x0, 0x0, 0xB, 0x82, 0x20, 0x0, 0x2, 0x0, + 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x7, 0x81, 0x23, + 0x91, 0x41, 0xC8, 0x4, 0x49, 0x6, 0x10, 0x32, + 0x39, 0x92, 0x1, 0x84, 0xC, 0x25, 0x5, 0x8, 0x19, + 0x1E, 0x4, 0x8B, 0x62, 0x80, 0x10, 0x45, 0x2, + 0x42, 0x92, 0xB, 0x42, 0x84, 0x10, 0x32, 0x14, + 0x38, 0x8, 0x18, 0x4B, 0xA, 0x32, 0x42, 0x88, + 0x48, 0x90, 0x14, 0x20, 0x43, 0x46, 0x88, 0xA5, + 0x0, 0x19, 0x32, 0x42, 0xE4, 0x48, 0xE, 0x90, + 0x11, 0x22, 0xC4, 0x50, 0x41, 0x51, 0x81, 0x8C, + 0xE1, 0x83, 0xE5, 0x8A, 0x4, 0x21, 0x46, 0x6, + 0x89, 0x20, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0, 0x32, + 0x22, 0x8, 0x9, 0x20, 0x64, 0x85, 0x4, 0x13, 0x22, + 0xA4, 0x84, 0x4, 0x13, 0x22, 0xE3, 0x84, 0xA1, + 0x90, 0x14, 0x12, 0x4C, 0x88, 0x8C, 0xB, 0x84, + 0x84, 0x4C, 0x10, 0x20, 0x73, 0x4, 0x8, 0xC1, + 0x65, 0xC3, 0x85, 0x2C, 0xE8, 0x3, 0x40, 0x14, + 0x91, 0x4E, 0xD1, 0x4A, 0x48, 0x44, 0x54, 0x11, + 0xC3, 0x9, 0x30, 0xC4, 0x18, 0x1, 0x30, 0x2, 0x50, + 0x82, 0x21, 0x1A, 0x8, 0x98, 0x23, 0x0, 0x3, 0x0, + 0x13, 0x14, 0x72, 0xC0, 0x87, 0x74, 0x60, 0x87, + 0x36, 0x68, 0x87, 0x79, 0x68, 0x3, 0x72, 0xC0, + 0x87, 0xD, 0xAE, 0x50, 0xE, 0x6D, 0xD0, 0xE, 0x7A, + 0x50, 0xE, 0x6D, 0x0, 0xF, 0x7A, 0x30, 0x7, 0x72, + 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x71, + 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x78, + 0xA0, 0x7, 0x78, 0xD0, 0x6, 
0xE9, 0x10, 0x7, 0x76, + 0xA0, 0x7, 0x71, 0x60, 0x7, 0x6D, 0x90, 0xE, 0x73, + 0x20, 0x7, 0x7A, 0x30, 0x7, 0x72, 0xD0, 0x6, 0xE9, + 0x60, 0x7, 0x74, 0xA0, 0x7, 0x76, 0x40, 0x7, 0x6D, + 0x60, 0xE, 0x71, 0x60, 0x7, 0x7A, 0x10, 0x7, 0x76, + 0xD0, 0x6, 0xE6, 0x30, 0x7, 0x72, 0xA0, 0x7, 0x73, + 0x20, 0x7, 0x6D, 0x60, 0xE, 0x76, 0x40, 0x7, 0x7A, + 0x60, 0x7, 0x74, 0xD0, 0x6, 0xEE, 0x80, 0x7, 0x7A, + 0x10, 0x7, 0x76, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x7A, + 0x60, 0x7, 0x74, 0x30, 0xE4, 0x21, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xB, + 0x4, 0x6, 0x0, 0x0, 0x0, 0x32, 0x1E, 0x98, 0xC, + 0x19, 0x11, 0x4C, 0x90, 0x8C, 0x9, 0x26, 0x47, + 0xC6, 0x4, 0x43, 0xBA, 0x12, 0x28, 0x86, 0x11, + 0x80, 0x42, 0x0, 0x0, 0x79, 0x18, 0x0, 0x0, 0xCB, + 0x0, 0x0, 0x0, 0x33, 0x8, 0x80, 0x1C, 0xC4, 0xE1, + 0x1C, 0x66, 0x14, 0x1, 0x3D, 0x88, 0x43, 0x38, + 0x84, 0xC3, 0x8C, 0x42, 0x80, 0x7, 0x79, 0x78, + 0x7, 0x73, 0x98, 0x71, 0xC, 0xE6, 0x0, 0xF, 0xED, + 0x10, 0xE, 0xF4, 0x80, 0xE, 0x33, 0xC, 0x42, 0x1E, + 0xC2, 0xC1, 0x1D, 0xCE, 0xA1, 0x1C, 0x66, 0x30, + 0x5, 0x3D, 0x88, 0x43, 0x38, 0x84, 0x83, 0x1B, + 0xCC, 0x3, 0x3D, 0xC8, 0x43, 0x3D, 0x8C, 0x3, + 0x3D, 0xCC, 0x78, 0x8C, 0x74, 0x70, 0x7, 0x7B, + 0x8, 0x7, 0x79, 0x48, 0x87, 0x70, 0x70, 0x7, 0x7A, + 0x70, 0x3, 0x76, 0x78, 0x87, 0x70, 0x20, 0x87, + 0x19, 0xCC, 0x11, 0xE, 0xEC, 0x90, 0xE, 0xE1, + 0x30, 0xF, 0x6E, 0x30, 0xF, 0xE3, 0xF0, 0xE, 0xF0, + 0x50, 0xE, 0x33, 0x10, 0xC4, 0x1D, 0xDE, 0x21, + 0x1C, 0xD8, 0x21, 0x1D, 0xC2, 0x61, 0x1E, 0x66, + 0x30, 0x89, 0x3B, 0xBC, 0x83, 0x3B, 0xD0, 0x43, + 0x39, 0xB4, 0x3, 0x3C, 0xBC, 0x83, 0x3C, 0x84, + 0x3, 0x3B, 0xCC, 0xF0, 0x14, 0x76, 0x60, 0x7, + 0x7B, 0x68, 0x7, 0x37, 0x68, 0x87, 0x72, 0x68, + 0x7, 0x37, 0x80, 0x87, 0x70, 0x90, 0x87, 0x70, + 0x60, 0x7, 0x76, 0x28, 0x7, 0x76, 0xF8, 0x5, 0x76, + 0x78, 0x87, 0x77, 0x80, 0x87, 0x5F, 0x8, 0x87, + 0x71, 0x18, 0x87, 0x72, 0x98, 0x87, 0x79, 0x98, + 0x81, 0x2C, 0xEE, 0xF0, 0xE, 0xEE, 0xE0, 0xE, + 0xF5, 0xC0, 0xE, 0xEC, 0x30, 0x3, 0x62, 0xC8, + 0xA1, 0x1C, 0xE4, 0xA1, 0x1C, 0xCC, 0xA1, 0x1C, + 0xE4, 0xA1, 0x1C, 0xDC, 0x61, 0x1C, 0xCA, 0x21, + 0x1C, 0xC4, 0x81, 0x1D, 0xCA, 0x61, 0x6, 0xD6, + 0x90, 0x43, 0x39, 0xC8, 0x43, 0x39, 0x98, 0x43, + 0x39, 0xC8, 0x43, 0x39, 0xB8, 0xC3, 0x38, 0x94, + 0x43, 0x38, 0x88, 0x3, 0x3B, 0x94, 0xC3, 0x2F, + 0xBC, 0x83, 0x3C, 0xFC, 0x82, 0x3B, 0xD4, 0x3, + 0x3B, 0xB0, 0xC3, 0xC, 0xC7, 0x69, 0x87, 0x70, + 0x58, 0x87, 0x72, 0x70, 0x83, 0x74, 0x68, 0x7, + 0x78, 0x60, 0x87, 0x74, 0x18, 0x87, 0x74, 0xA0, + 0x87, 0x19, 0xCE, 0x53, 0xF, 0xEE, 0x0, 0xF, 0xF2, + 0x50, 0xE, 0xE4, 0x90, 0xE, 0xE3, 0x40, 0xF, 0xE1, + 0x20, 0xE, 0xEC, 0x50, 0xE, 0x33, 0x20, 0x28, + 0x1D, 0xDC, 0xC1, 0x1E, 0xC2, 0x41, 0x1E, 0xD2, + 0x21, 0x1C, 0xDC, 0x81, 0x1E, 0xDC, 0xE0, 0x1C, + 0xE4, 0xE1, 0x1D, 0xEA, 0x1, 0x1E, 0x66, 0x18, + 0x51, 0x38, 0xB0, 0x43, 0x3A, 0x9C, 0x83, 0x3B, + 0xCC, 0x50, 0x24, 0x76, 0x60, 0x7, 0x7B, 0x68, + 0x7, 0x37, 0x60, 0x87, 0x77, 0x78, 0x7, 0x78, + 0x98, 0x51, 0x4C, 0xF4, 0x90, 0xF, 0xF0, 0x50, + 0xE, 0x33, 0x1E, 0x6A, 0x1E, 0xCA, 0x61, 0x1C, + 0xE8, 0x21, 0x1D, 0xDE, 0xC1, 0x1D, 0x7E, 0x1, + 0x1E, 0xE4, 0xA1, 0x1C, 0xCC, 0x21, 0x1D, 0xF0, + 0x61, 0x6, 0x54, 0x85, 0x83, 0x38, 0xCC, 0xC3, + 0x3B, 0xB0, 0x43, 0x3D, 0xD0, 0x43, 0x39, 0xFC, + 0xC2, 0x3C, 0xE4, 0x43, 0x3B, 0x88, 0xC3, 0x3B, + 0xB0, 0xC3, 0x8C, 0xC5, 0xA, 0x87, 0x79, 0x98, + 0x87, 0x77, 0x18, 0x87, 0x74, 0x8, 0x7, 0x7A, + 0x28, 0x7, 0x72, 0x98, 0x81, 0x5C, 0xE3, 0x10, + 0xE, 0xEC, 0xC0, 0xE, 0xE5, 0x50, 0xE, 0xF3, 0x30, + 0x23, 0xC1, 0xD2, 0x41, 0x1E, 0xE4, 0xE1, 0x17, + 0xD8, 0xE1, 0x1D, 
0xDE, 0x1, 0x1E, 0x66, 0x48, + 0x19, 0x3B, 0xB0, 0x83, 0x3D, 0xB4, 0x83, 0x1B, + 0x84, 0xC3, 0x38, 0x8C, 0x43, 0x39, 0xCC, 0xC3, + 0x3C, 0xB8, 0xC1, 0x39, 0xC8, 0xC3, 0x3B, 0xD4, + 0x3, 0x3C, 0xCC, 0x48, 0xB4, 0x71, 0x8, 0x7, 0x76, + 0x60, 0x7, 0x71, 0x8, 0x87, 0x71, 0x58, 0x87, + 0x19, 0xDB, 0xC6, 0xE, 0xEC, 0x60, 0xF, 0xED, + 0xE0, 0x6, 0xF0, 0x20, 0xF, 0xE5, 0x30, 0xF, 0xE5, + 0x20, 0xF, 0xF6, 0x50, 0xE, 0x6E, 0x10, 0xE, 0xE3, + 0x30, 0xE, 0xE5, 0x30, 0xF, 0xF3, 0xE0, 0x6, 0xE9, + 0xE0, 0xE, 0xE4, 0x50, 0xE, 0xF8, 0x30, 0x23, + 0xE2, 0xEC, 0x61, 0x1C, 0xC2, 0x81, 0x1D, 0xD8, + 0xE1, 0x17, 0xEC, 0x21, 0x1D, 0xE6, 0x21, 0x1D, + 0xC4, 0x21, 0x1D, 0xD8, 0x21, 0x1D, 0xE8, 0x21, + 0x1F, 0x66, 0x20, 0x9D, 0x3B, 0xBC, 0x43, 0x3D, + 0xB8, 0x3, 0x39, 0x94, 0x83, 0x39, 0xCC, 0x58, + 0xBC, 0x70, 0x70, 0x7, 0x77, 0x78, 0x7, 0x7A, + 0x8, 0x7, 0x7A, 0x48, 0x87, 0x77, 0x70, 0x87, + 0x19, 0xCB, 0xE7, 0xE, 0xEF, 0x30, 0xF, 0xE1, + 0xE0, 0xE, 0xE9, 0x40, 0xF, 0xE9, 0xA0, 0xF, 0xE5, + 0x30, 0xC3, 0x1, 0x3, 0x73, 0xA8, 0x7, 0x77, 0x18, + 0x87, 0x5F, 0x98, 0x87, 0x70, 0x70, 0x87, 0x74, + 0xA0, 0x87, 0x74, 0xD0, 0x87, 0x72, 0x98, 0x81, + 0x84, 0x41, 0x39, 0xE0, 0xC3, 0x38, 0xB0, 0x43, + 0x3D, 0x90, 0x43, 0x39, 0xCC, 0x40, 0xC4, 0xA0, + 0x1D, 0xCA, 0xA1, 0x1D, 0xE0, 0x41, 0x1E, 0xDE, + 0xC1, 0x1C, 0x66, 0x24, 0x63, 0x30, 0xE, 0xE1, + 0xC0, 0xE, 0xEC, 0x30, 0xF, 0xE9, 0x40, 0xF, 0xE5, + 0x30, 0x43, 0x21, 0x83, 0x75, 0x18, 0x7, 0x73, + 0x48, 0x87, 0x5F, 0xA0, 0x87, 0x7C, 0x80, 0x87, + 0x72, 0x98, 0xB1, 0x94, 0x1, 0x3C, 0x8C, 0xC3, + 0x3C, 0x94, 0xC3, 0x38, 0xD0, 0x43, 0x3A, 0xBC, + 0x83, 0x3B, 0xCC, 0xC3, 0x8C, 0xC5, 0xC, 0x48, + 0x21, 0x15, 0x42, 0x61, 0x1E, 0xE6, 0x21, 0x1D, + 0xCE, 0xC1, 0x1D, 0x52, 0x81, 0x14, 0x66, 0x4C, + 0x67, 0x30, 0xE, 0xEF, 0x20, 0xF, 0xEF, 0xE0, + 0x6, 0xEF, 0x50, 0xF, 0xF4, 0x30, 0xF, 0xE9, 0x40, + 0xE, 0xE5, 0xE0, 0x6, 0xE6, 0x20, 0xF, 0xE1, 0xD0, + 0xE, 0xE5, 0x30, 0xA3, 0x40, 0x83, 0x76, 0x68, + 0x7, 0x79, 0x8, 0x87, 0x19, 0x52, 0x1A, 0xB8, + 0xC3, 0x3B, 0x84, 0x3, 0x3B, 0xA4, 0x43, 0x38, + 0xCC, 0x83, 0x1B, 0x84, 0x3, 0x39, 0x90, 0x83, + 0x3C, 0xCC, 0x3, 0x3C, 0x84, 0xC3, 0x38, 0x94, + 0x3, 0x0, 0x0, 0x0, 0x0, 0x79, 0x28, 0x0, 0x0, + 0x2A, 0x0, 0x0, 0x0, 0xC2, 0x3C, 0x90, 0x40, 0x86, + 0x10, 0x19, 0x32, 0xE2, 0x64, 0x90, 0x40, 0x46, + 0x2, 0x19, 0x23, 0x23, 0x46, 0x2, 0x13, 0x24, + 0xC6, 0x0, 0x13, 0x74, 0x12, 0xA9, 0xB7, 0x37, + 0x3A, 0x23, 0xB6, 0xB0, 0xB3, 0xB9, 0x23, 0x8C, + 0xCD, 0x1D, 0xA2, 0x2D, 0x2C, 0xCD, 0x6D, 0x8, + 0x42, 0x1, 0xC, 0x41, 0x38, 0x82, 0x21, 0x8, 0x87, + 0x30, 0x4, 0xE1, 0x18, 0x86, 0x20, 0x1C, 0xC4, + 0x18, 0x84, 0xA0, 0x18, 0x43, 0x90, 0x8C, 0x41, + 0x20, 0x94, 0x31, 0xC, 0x82, 0x71, 0x8C, 0x41, + 0x28, 0x8E, 0x31, 0xC, 0x45, 0x51, 0x8C, 0x41, + 0x40, 0x9C, 0x31, 0x14, 0xC4, 0x0, 0x0, 0x8F, + 0x89, 0xC8, 0xF0, 0x5C, 0xE4, 0xDE, 0xDE, 0xE8, + 0xE6, 0xD2, 0xCE, 0xDC, 0xC2, 0xE8, 0xEA, 0xE4, + 0xCA, 0xE6, 0x86, 0x12, 0x28, 0xC6, 0x21, 0xC3, + 0x73, 0x99, 0x43, 0xB, 0x23, 0x2B, 0x93, 0x6B, + 0x7A, 0x23, 0x2B, 0x63, 0x1B, 0x4A, 0xB0, 0x18, + 0x85, 0xC, 0xCF, 0xC5, 0xAE, 0x4C, 0x6E, 0x2E, + 0xED, 0xCD, 0x6D, 0x28, 0x1, 0x63, 0x1C, 0x32, + 0x3C, 0x97, 0x32, 0x37, 0x3A, 0xB9, 0x3C, 0xA8, + 0xB7, 0x34, 0x37, 0xBA, 0xB9, 0xA1, 0x4, 0xF, + 0x0, 0x0, 0x71, 0x20, 0x0, 0x0, 0x2, 0x0, 0x0, + 0x0, 0x6, 0x40, 0x30, 0x0, 0xD2, 0x0, 0x0, 0x0, + 0x61, 0x20, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x13, + 0x4, 0x1, 0x86, 0x3, 0x1, 0x0, 0x0, 0x2, 0x0, + 0x0, 0x0, 0x7, 0x50, 0x10, 0xCD, 0x14, 0x61, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] - Name: RTS0 Size: 8 RootSignature: 
Size: 8 AllowInputAssemblerInputLayout: true - + + + #CHECK: - Name: RTS0 #CHECK-NEXT: Size: 8 #CHECK-NEXT: RootSignature: #CHECK-NEXT: Size: 8 #CHECK-NEXT: AllowInputAssemblerInputLayout: true + +# DXC: !dx.rootsignatures = !{[[RS:![0-9]+]]} +# DXC: [[RS]] = !{void ()* @main, [[REL:![0-9]+]]} +# DXC: [[REL]] = !{[[RF:![0-9]+]]} +# DXC: [[RF]] = !{!"RootFlags", i32 1} From 422578ffcf447cfa6506d30539dc3d1caea0c586 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 31 Jan 2025 22:53:06 +0000 Subject: [PATCH 143/220] adding compatibility test --- .../DXContainer/RootSignature-Flags.yaml | 191 +---------------- llvm/test/tools/dxil-dis/lit.local.cfg | 2 +- llvm/test/tools/dxil-dis/root-signature.yaml | 201 ++++++++++++++++++ 3 files changed, 204 insertions(+), 190 deletions(-) create mode 100644 llvm/test/tools/dxil-dis/root-signature.yaml diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index bcb04c2c9edd9..e3ca7347d52c7 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -1,5 +1,4 @@ # RUN: yaml2obj %s | obj2yaml | FileCheck %s -# RUN: yaml2obj %s | dxil-dis | FileCheck %s --check-prefix=DXC --- !dxcontainer Header: @@ -8,203 +7,17 @@ Header: Version: Major: 1 Minor: 0 - PartCount: 2 - PartOffsets: [ 60, 1496 ] + PartCount: 1 + PartOffsets: [ 60 ] Parts: - - Name: DXIL - Size: 1428 - Program: - MajorVersion: 6 - MinorVersion: 0 - ShaderKind: 5 - Size: 357 - DXILMajorVersion: 1 - DXILMinorVersion: 0 - DXILSize: 1404 - DXIL: [ 0x42, 0x43, 0xC0, 0xDE, 0x21, 0xC, 0x0, 0x0, 0x5C, - 0x1, 0x0, 0x0, 0xB, 0x82, 0x20, 0x0, 0x2, 0x0, - 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x7, 0x81, 0x23, - 0x91, 0x41, 0xC8, 0x4, 0x49, 0x6, 0x10, 0x32, - 0x39, 0x92, 0x1, 0x84, 0xC, 0x25, 0x5, 0x8, 0x19, - 0x1E, 0x4, 0x8B, 0x62, 0x80, 0x10, 0x45, 0x2, - 0x42, 0x92, 0xB, 0x42, 0x84, 0x10, 0x32, 0x14, - 0x38, 0x8, 0x18, 0x4B, 0xA, 0x32, 0x42, 0x88, - 0x48, 0x90, 0x14, 0x20, 0x43, 0x46, 0x88, 0xA5, - 0x0, 0x19, 0x32, 0x42, 0xE4, 0x48, 0xE, 0x90, - 0x11, 0x22, 0xC4, 0x50, 0x41, 0x51, 0x81, 0x8C, - 0xE1, 0x83, 0xE5, 0x8A, 0x4, 0x21, 0x46, 0x6, - 0x89, 0x20, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0, 0x32, - 0x22, 0x8, 0x9, 0x20, 0x64, 0x85, 0x4, 0x13, 0x22, - 0xA4, 0x84, 0x4, 0x13, 0x22, 0xE3, 0x84, 0xA1, - 0x90, 0x14, 0x12, 0x4C, 0x88, 0x8C, 0xB, 0x84, - 0x84, 0x4C, 0x10, 0x20, 0x73, 0x4, 0x8, 0xC1, - 0x65, 0xC3, 0x85, 0x2C, 0xE8, 0x3, 0x40, 0x14, - 0x91, 0x4E, 0xD1, 0x4A, 0x48, 0x44, 0x54, 0x11, - 0xC3, 0x9, 0x30, 0xC4, 0x18, 0x1, 0x30, 0x2, 0x50, - 0x82, 0x21, 0x1A, 0x8, 0x98, 0x23, 0x0, 0x3, 0x0, - 0x13, 0x14, 0x72, 0xC0, 0x87, 0x74, 0x60, 0x87, - 0x36, 0x68, 0x87, 0x79, 0x68, 0x3, 0x72, 0xC0, - 0x87, 0xD, 0xAE, 0x50, 0xE, 0x6D, 0xD0, 0xE, 0x7A, - 0x50, 0xE, 0x6D, 0x0, 0xF, 0x7A, 0x30, 0x7, 0x72, - 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x71, - 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x78, - 0xA0, 0x7, 0x78, 0xD0, 0x6, 0xE9, 0x10, 0x7, 0x76, - 0xA0, 0x7, 0x71, 0x60, 0x7, 0x6D, 0x90, 0xE, 0x73, - 0x20, 0x7, 0x7A, 0x30, 0x7, 0x72, 0xD0, 0x6, 0xE9, - 0x60, 0x7, 0x74, 0xA0, 0x7, 0x76, 0x40, 0x7, 0x6D, - 0x60, 0xE, 0x71, 0x60, 0x7, 0x7A, 0x10, 0x7, 0x76, - 0xD0, 0x6, 0xE6, 0x30, 0x7, 0x72, 0xA0, 0x7, 0x73, - 0x20, 0x7, 0x6D, 0x60, 0xE, 0x76, 0x40, 0x7, 0x7A, - 0x60, 0x7, 0x74, 0xD0, 0x6, 0xEE, 0x80, 0x7, 0x7A, - 0x10, 0x7, 0x76, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x7A, - 0x60, 0x7, 0x74, 0x30, 0xE4, 0x21, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xB, - 
0x4, 0x6, 0x0, 0x0, 0x0, 0x32, 0x1E, 0x98, 0xC, - 0x19, 0x11, 0x4C, 0x90, 0x8C, 0x9, 0x26, 0x47, - 0xC6, 0x4, 0x43, 0xBA, 0x12, 0x28, 0x86, 0x11, - 0x80, 0x42, 0x0, 0x0, 0x79, 0x18, 0x0, 0x0, 0xCB, - 0x0, 0x0, 0x0, 0x33, 0x8, 0x80, 0x1C, 0xC4, 0xE1, - 0x1C, 0x66, 0x14, 0x1, 0x3D, 0x88, 0x43, 0x38, - 0x84, 0xC3, 0x8C, 0x42, 0x80, 0x7, 0x79, 0x78, - 0x7, 0x73, 0x98, 0x71, 0xC, 0xE6, 0x0, 0xF, 0xED, - 0x10, 0xE, 0xF4, 0x80, 0xE, 0x33, 0xC, 0x42, 0x1E, - 0xC2, 0xC1, 0x1D, 0xCE, 0xA1, 0x1C, 0x66, 0x30, - 0x5, 0x3D, 0x88, 0x43, 0x38, 0x84, 0x83, 0x1B, - 0xCC, 0x3, 0x3D, 0xC8, 0x43, 0x3D, 0x8C, 0x3, - 0x3D, 0xCC, 0x78, 0x8C, 0x74, 0x70, 0x7, 0x7B, - 0x8, 0x7, 0x79, 0x48, 0x87, 0x70, 0x70, 0x7, 0x7A, - 0x70, 0x3, 0x76, 0x78, 0x87, 0x70, 0x20, 0x87, - 0x19, 0xCC, 0x11, 0xE, 0xEC, 0x90, 0xE, 0xE1, - 0x30, 0xF, 0x6E, 0x30, 0xF, 0xE3, 0xF0, 0xE, 0xF0, - 0x50, 0xE, 0x33, 0x10, 0xC4, 0x1D, 0xDE, 0x21, - 0x1C, 0xD8, 0x21, 0x1D, 0xC2, 0x61, 0x1E, 0x66, - 0x30, 0x89, 0x3B, 0xBC, 0x83, 0x3B, 0xD0, 0x43, - 0x39, 0xB4, 0x3, 0x3C, 0xBC, 0x83, 0x3C, 0x84, - 0x3, 0x3B, 0xCC, 0xF0, 0x14, 0x76, 0x60, 0x7, - 0x7B, 0x68, 0x7, 0x37, 0x68, 0x87, 0x72, 0x68, - 0x7, 0x37, 0x80, 0x87, 0x70, 0x90, 0x87, 0x70, - 0x60, 0x7, 0x76, 0x28, 0x7, 0x76, 0xF8, 0x5, 0x76, - 0x78, 0x87, 0x77, 0x80, 0x87, 0x5F, 0x8, 0x87, - 0x71, 0x18, 0x87, 0x72, 0x98, 0x87, 0x79, 0x98, - 0x81, 0x2C, 0xEE, 0xF0, 0xE, 0xEE, 0xE0, 0xE, - 0xF5, 0xC0, 0xE, 0xEC, 0x30, 0x3, 0x62, 0xC8, - 0xA1, 0x1C, 0xE4, 0xA1, 0x1C, 0xCC, 0xA1, 0x1C, - 0xE4, 0xA1, 0x1C, 0xDC, 0x61, 0x1C, 0xCA, 0x21, - 0x1C, 0xC4, 0x81, 0x1D, 0xCA, 0x61, 0x6, 0xD6, - 0x90, 0x43, 0x39, 0xC8, 0x43, 0x39, 0x98, 0x43, - 0x39, 0xC8, 0x43, 0x39, 0xB8, 0xC3, 0x38, 0x94, - 0x43, 0x38, 0x88, 0x3, 0x3B, 0x94, 0xC3, 0x2F, - 0xBC, 0x83, 0x3C, 0xFC, 0x82, 0x3B, 0xD4, 0x3, - 0x3B, 0xB0, 0xC3, 0xC, 0xC7, 0x69, 0x87, 0x70, - 0x58, 0x87, 0x72, 0x70, 0x83, 0x74, 0x68, 0x7, - 0x78, 0x60, 0x87, 0x74, 0x18, 0x87, 0x74, 0xA0, - 0x87, 0x19, 0xCE, 0x53, 0xF, 0xEE, 0x0, 0xF, 0xF2, - 0x50, 0xE, 0xE4, 0x90, 0xE, 0xE3, 0x40, 0xF, 0xE1, - 0x20, 0xE, 0xEC, 0x50, 0xE, 0x33, 0x20, 0x28, - 0x1D, 0xDC, 0xC1, 0x1E, 0xC2, 0x41, 0x1E, 0xD2, - 0x21, 0x1C, 0xDC, 0x81, 0x1E, 0xDC, 0xE0, 0x1C, - 0xE4, 0xE1, 0x1D, 0xEA, 0x1, 0x1E, 0x66, 0x18, - 0x51, 0x38, 0xB0, 0x43, 0x3A, 0x9C, 0x83, 0x3B, - 0xCC, 0x50, 0x24, 0x76, 0x60, 0x7, 0x7B, 0x68, - 0x7, 0x37, 0x60, 0x87, 0x77, 0x78, 0x7, 0x78, - 0x98, 0x51, 0x4C, 0xF4, 0x90, 0xF, 0xF0, 0x50, - 0xE, 0x33, 0x1E, 0x6A, 0x1E, 0xCA, 0x61, 0x1C, - 0xE8, 0x21, 0x1D, 0xDE, 0xC1, 0x1D, 0x7E, 0x1, - 0x1E, 0xE4, 0xA1, 0x1C, 0xCC, 0x21, 0x1D, 0xF0, - 0x61, 0x6, 0x54, 0x85, 0x83, 0x38, 0xCC, 0xC3, - 0x3B, 0xB0, 0x43, 0x3D, 0xD0, 0x43, 0x39, 0xFC, - 0xC2, 0x3C, 0xE4, 0x43, 0x3B, 0x88, 0xC3, 0x3B, - 0xB0, 0xC3, 0x8C, 0xC5, 0xA, 0x87, 0x79, 0x98, - 0x87, 0x77, 0x18, 0x87, 0x74, 0x8, 0x7, 0x7A, - 0x28, 0x7, 0x72, 0x98, 0x81, 0x5C, 0xE3, 0x10, - 0xE, 0xEC, 0xC0, 0xE, 0xE5, 0x50, 0xE, 0xF3, 0x30, - 0x23, 0xC1, 0xD2, 0x41, 0x1E, 0xE4, 0xE1, 0x17, - 0xD8, 0xE1, 0x1D, 0xDE, 0x1, 0x1E, 0x66, 0x48, - 0x19, 0x3B, 0xB0, 0x83, 0x3D, 0xB4, 0x83, 0x1B, - 0x84, 0xC3, 0x38, 0x8C, 0x43, 0x39, 0xCC, 0xC3, - 0x3C, 0xB8, 0xC1, 0x39, 0xC8, 0xC3, 0x3B, 0xD4, - 0x3, 0x3C, 0xCC, 0x48, 0xB4, 0x71, 0x8, 0x7, 0x76, - 0x60, 0x7, 0x71, 0x8, 0x87, 0x71, 0x58, 0x87, - 0x19, 0xDB, 0xC6, 0xE, 0xEC, 0x60, 0xF, 0xED, - 0xE0, 0x6, 0xF0, 0x20, 0xF, 0xE5, 0x30, 0xF, 0xE5, - 0x20, 0xF, 0xF6, 0x50, 0xE, 0x6E, 0x10, 0xE, 0xE3, - 0x30, 0xE, 0xE5, 0x30, 0xF, 0xF3, 0xE0, 0x6, 0xE9, - 0xE0, 0xE, 0xE4, 0x50, 0xE, 0xF8, 0x30, 0x23, - 0xE2, 0xEC, 
0x61, 0x1C, 0xC2, 0x81, 0x1D, 0xD8, - 0xE1, 0x17, 0xEC, 0x21, 0x1D, 0xE6, 0x21, 0x1D, - 0xC4, 0x21, 0x1D, 0xD8, 0x21, 0x1D, 0xE8, 0x21, - 0x1F, 0x66, 0x20, 0x9D, 0x3B, 0xBC, 0x43, 0x3D, - 0xB8, 0x3, 0x39, 0x94, 0x83, 0x39, 0xCC, 0x58, - 0xBC, 0x70, 0x70, 0x7, 0x77, 0x78, 0x7, 0x7A, - 0x8, 0x7, 0x7A, 0x48, 0x87, 0x77, 0x70, 0x87, - 0x19, 0xCB, 0xE7, 0xE, 0xEF, 0x30, 0xF, 0xE1, - 0xE0, 0xE, 0xE9, 0x40, 0xF, 0xE9, 0xA0, 0xF, 0xE5, - 0x30, 0xC3, 0x1, 0x3, 0x73, 0xA8, 0x7, 0x77, 0x18, - 0x87, 0x5F, 0x98, 0x87, 0x70, 0x70, 0x87, 0x74, - 0xA0, 0x87, 0x74, 0xD0, 0x87, 0x72, 0x98, 0x81, - 0x84, 0x41, 0x39, 0xE0, 0xC3, 0x38, 0xB0, 0x43, - 0x3D, 0x90, 0x43, 0x39, 0xCC, 0x40, 0xC4, 0xA0, - 0x1D, 0xCA, 0xA1, 0x1D, 0xE0, 0x41, 0x1E, 0xDE, - 0xC1, 0x1C, 0x66, 0x24, 0x63, 0x30, 0xE, 0xE1, - 0xC0, 0xE, 0xEC, 0x30, 0xF, 0xE9, 0x40, 0xF, 0xE5, - 0x30, 0x43, 0x21, 0x83, 0x75, 0x18, 0x7, 0x73, - 0x48, 0x87, 0x5F, 0xA0, 0x87, 0x7C, 0x80, 0x87, - 0x72, 0x98, 0xB1, 0x94, 0x1, 0x3C, 0x8C, 0xC3, - 0x3C, 0x94, 0xC3, 0x38, 0xD0, 0x43, 0x3A, 0xBC, - 0x83, 0x3B, 0xCC, 0xC3, 0x8C, 0xC5, 0xC, 0x48, - 0x21, 0x15, 0x42, 0x61, 0x1E, 0xE6, 0x21, 0x1D, - 0xCE, 0xC1, 0x1D, 0x52, 0x81, 0x14, 0x66, 0x4C, - 0x67, 0x30, 0xE, 0xEF, 0x20, 0xF, 0xEF, 0xE0, - 0x6, 0xEF, 0x50, 0xF, 0xF4, 0x30, 0xF, 0xE9, 0x40, - 0xE, 0xE5, 0xE0, 0x6, 0xE6, 0x20, 0xF, 0xE1, 0xD0, - 0xE, 0xE5, 0x30, 0xA3, 0x40, 0x83, 0x76, 0x68, - 0x7, 0x79, 0x8, 0x87, 0x19, 0x52, 0x1A, 0xB8, - 0xC3, 0x3B, 0x84, 0x3, 0x3B, 0xA4, 0x43, 0x38, - 0xCC, 0x83, 0x1B, 0x84, 0x3, 0x39, 0x90, 0x83, - 0x3C, 0xCC, 0x3, 0x3C, 0x84, 0xC3, 0x38, 0x94, - 0x3, 0x0, 0x0, 0x0, 0x0, 0x79, 0x28, 0x0, 0x0, - 0x2A, 0x0, 0x0, 0x0, 0xC2, 0x3C, 0x90, 0x40, 0x86, - 0x10, 0x19, 0x32, 0xE2, 0x64, 0x90, 0x40, 0x46, - 0x2, 0x19, 0x23, 0x23, 0x46, 0x2, 0x13, 0x24, - 0xC6, 0x0, 0x13, 0x74, 0x12, 0xA9, 0xB7, 0x37, - 0x3A, 0x23, 0xB6, 0xB0, 0xB3, 0xB9, 0x23, 0x8C, - 0xCD, 0x1D, 0xA2, 0x2D, 0x2C, 0xCD, 0x6D, 0x8, - 0x42, 0x1, 0xC, 0x41, 0x38, 0x82, 0x21, 0x8, 0x87, - 0x30, 0x4, 0xE1, 0x18, 0x86, 0x20, 0x1C, 0xC4, - 0x18, 0x84, 0xA0, 0x18, 0x43, 0x90, 0x8C, 0x41, - 0x20, 0x94, 0x31, 0xC, 0x82, 0x71, 0x8C, 0x41, - 0x28, 0x8E, 0x31, 0xC, 0x45, 0x51, 0x8C, 0x41, - 0x40, 0x9C, 0x31, 0x14, 0xC4, 0x0, 0x0, 0x8F, - 0x89, 0xC8, 0xF0, 0x5C, 0xE4, 0xDE, 0xDE, 0xE8, - 0xE6, 0xD2, 0xCE, 0xDC, 0xC2, 0xE8, 0xEA, 0xE4, - 0xCA, 0xE6, 0x86, 0x12, 0x28, 0xC6, 0x21, 0xC3, - 0x73, 0x99, 0x43, 0xB, 0x23, 0x2B, 0x93, 0x6B, - 0x7A, 0x23, 0x2B, 0x63, 0x1B, 0x4A, 0xB0, 0x18, - 0x85, 0xC, 0xCF, 0xC5, 0xAE, 0x4C, 0x6E, 0x2E, - 0xED, 0xCD, 0x6D, 0x28, 0x1, 0x63, 0x1C, 0x32, - 0x3C, 0x97, 0x32, 0x37, 0x3A, 0xB9, 0x3C, 0xA8, - 0xB7, 0x34, 0x37, 0xBA, 0xB9, 0xA1, 0x4, 0xF, - 0x0, 0x0, 0x71, 0x20, 0x0, 0x0, 0x2, 0x0, 0x0, - 0x0, 0x6, 0x40, 0x30, 0x0, 0xD2, 0x0, 0x0, 0x0, - 0x61, 0x20, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x13, - 0x4, 0x1, 0x86, 0x3, 0x1, 0x0, 0x0, 0x2, 0x0, - 0x0, 0x0, 0x7, 0x50, 0x10, 0xCD, 0x14, 0x61, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] - Name: RTS0 Size: 8 RootSignature: Size: 8 AllowInputAssemblerInputLayout: true - - #CHECK: - Name: RTS0 #CHECK-NEXT: Size: 8 #CHECK-NEXT: RootSignature: #CHECK-NEXT: Size: 8 #CHECK-NEXT: AllowInputAssemblerInputLayout: true - -# DXC: !dx.rootsignatures = !{[[RS:![0-9]+]]} -# DXC: [[RS]] = !{void ()* @main, [[REL:![0-9]+]]} -# DXC: [[REL]] = !{[[RF:![0-9]+]]} -# DXC: [[RF]] = !{!"RootFlags", i32 1} diff --git a/llvm/test/tools/dxil-dis/lit.local.cfg b/llvm/test/tools/dxil-dis/lit.local.cfg index 7b6819e0b406a..8fe45f696bff9 100644 --- a/llvm/test/tools/dxil-dis/lit.local.cfg 
+++ b/llvm/test/tools/dxil-dis/lit.local.cfg @@ -1,3 +1,3 @@ if not config.dxil_tests: config.unsupported = True -config.suffixes = [".ll"] +config.suffixes = [".ll", ".yaml"] diff --git a/llvm/test/tools/dxil-dis/root-signature.yaml b/llvm/test/tools/dxil-dis/root-signature.yaml new file mode 100644 index 0000000000000..2a11dd9b3fcee --- /dev/null +++ b/llvm/test/tools/dxil-dis/root-signature.yaml @@ -0,0 +1,201 @@ +# RUN: yaml2obj %s | dxil-dis | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 2 + PartOffsets: [ 60, 1496 ] +Parts: + - Name: DXIL + Size: 1428 + Program: + MajorVersion: 6 + MinorVersion: 0 + ShaderKind: 5 + Size: 357 + DXILMajorVersion: 1 + DXILMinorVersion: 0 + DXILSize: 1404 + DXIL: [ 0x42, 0x43, 0xC0, 0xDE, 0x21, 0xC, 0x0, 0x0, 0x5C, + 0x1, 0x0, 0x0, 0xB, 0x82, 0x20, 0x0, 0x2, 0x0, + 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x7, 0x81, 0x23, + 0x91, 0x41, 0xC8, 0x4, 0x49, 0x6, 0x10, 0x32, + 0x39, 0x92, 0x1, 0x84, 0xC, 0x25, 0x5, 0x8, 0x19, + 0x1E, 0x4, 0x8B, 0x62, 0x80, 0x10, 0x45, 0x2, + 0x42, 0x92, 0xB, 0x42, 0x84, 0x10, 0x32, 0x14, + 0x38, 0x8, 0x18, 0x4B, 0xA, 0x32, 0x42, 0x88, + 0x48, 0x90, 0x14, 0x20, 0x43, 0x46, 0x88, 0xA5, + 0x0, 0x19, 0x32, 0x42, 0xE4, 0x48, 0xE, 0x90, + 0x11, 0x22, 0xC4, 0x50, 0x41, 0x51, 0x81, 0x8C, + 0xE1, 0x83, 0xE5, 0x8A, 0x4, 0x21, 0x46, 0x6, + 0x89, 0x20, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0, 0x32, + 0x22, 0x8, 0x9, 0x20, 0x64, 0x85, 0x4, 0x13, 0x22, + 0xA4, 0x84, 0x4, 0x13, 0x22, 0xE3, 0x84, 0xA1, + 0x90, 0x14, 0x12, 0x4C, 0x88, 0x8C, 0xB, 0x84, + 0x84, 0x4C, 0x10, 0x20, 0x73, 0x4, 0x8, 0xC1, + 0x65, 0xC3, 0x85, 0x2C, 0xE8, 0x3, 0x40, 0x14, + 0x91, 0x4E, 0xD1, 0x4A, 0x48, 0x44, 0x54, 0x11, + 0xC3, 0x9, 0x30, 0xC4, 0x18, 0x1, 0x30, 0x2, 0x50, + 0x82, 0x21, 0x1A, 0x8, 0x98, 0x23, 0x0, 0x3, 0x0, + 0x13, 0x14, 0x72, 0xC0, 0x87, 0x74, 0x60, 0x87, + 0x36, 0x68, 0x87, 0x79, 0x68, 0x3, 0x72, 0xC0, + 0x87, 0xD, 0xAE, 0x50, 0xE, 0x6D, 0xD0, 0xE, 0x7A, + 0x50, 0xE, 0x6D, 0x0, 0xF, 0x7A, 0x30, 0x7, 0x72, + 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x71, + 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x78, + 0xA0, 0x7, 0x78, 0xD0, 0x6, 0xE9, 0x10, 0x7, 0x76, + 0xA0, 0x7, 0x71, 0x60, 0x7, 0x6D, 0x90, 0xE, 0x73, + 0x20, 0x7, 0x7A, 0x30, 0x7, 0x72, 0xD0, 0x6, 0xE9, + 0x60, 0x7, 0x74, 0xA0, 0x7, 0x76, 0x40, 0x7, 0x6D, + 0x60, 0xE, 0x71, 0x60, 0x7, 0x7A, 0x10, 0x7, 0x76, + 0xD0, 0x6, 0xE6, 0x30, 0x7, 0x72, 0xA0, 0x7, 0x73, + 0x20, 0x7, 0x6D, 0x60, 0xE, 0x76, 0x40, 0x7, 0x7A, + 0x60, 0x7, 0x74, 0xD0, 0x6, 0xEE, 0x80, 0x7, 0x7A, + 0x10, 0x7, 0x76, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x7A, + 0x60, 0x7, 0x74, 0x30, 0xE4, 0x21, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xB, + 0x4, 0x6, 0x0, 0x0, 0x0, 0x32, 0x1E, 0x98, 0xC, + 0x19, 0x11, 0x4C, 0x90, 0x8C, 0x9, 0x26, 0x47, + 0xC6, 0x4, 0x43, 0xBA, 0x12, 0x28, 0x86, 0x11, + 0x80, 0x42, 0x0, 0x0, 0x79, 0x18, 0x0, 0x0, 0xCB, + 0x0, 0x0, 0x0, 0x33, 0x8, 0x80, 0x1C, 0xC4, 0xE1, + 0x1C, 0x66, 0x14, 0x1, 0x3D, 0x88, 0x43, 0x38, + 0x84, 0xC3, 0x8C, 0x42, 0x80, 0x7, 0x79, 0x78, + 0x7, 0x73, 0x98, 0x71, 0xC, 0xE6, 0x0, 0xF, 0xED, + 0x10, 0xE, 0xF4, 0x80, 0xE, 0x33, 0xC, 0x42, 0x1E, + 0xC2, 0xC1, 0x1D, 0xCE, 0xA1, 0x1C, 0x66, 0x30, + 0x5, 0x3D, 0x88, 0x43, 0x38, 0x84, 0x83, 0x1B, + 0xCC, 0x3, 0x3D, 0xC8, 0x43, 0x3D, 0x8C, 0x3, + 0x3D, 0xCC, 0x78, 0x8C, 0x74, 0x70, 0x7, 0x7B, + 0x8, 0x7, 0x79, 0x48, 0x87, 0x70, 0x70, 0x7, 0x7A, + 0x70, 0x3, 0x76, 0x78, 0x87, 0x70, 0x20, 0x87, + 0x19, 0xCC, 0x11, 
0xE, 0xEC, 0x90, 0xE, 0xE1, + 0x30, 0xF, 0x6E, 0x30, 0xF, 0xE3, 0xF0, 0xE, 0xF0, + 0x50, 0xE, 0x33, 0x10, 0xC4, 0x1D, 0xDE, 0x21, + 0x1C, 0xD8, 0x21, 0x1D, 0xC2, 0x61, 0x1E, 0x66, + 0x30, 0x89, 0x3B, 0xBC, 0x83, 0x3B, 0xD0, 0x43, + 0x39, 0xB4, 0x3, 0x3C, 0xBC, 0x83, 0x3C, 0x84, + 0x3, 0x3B, 0xCC, 0xF0, 0x14, 0x76, 0x60, 0x7, + 0x7B, 0x68, 0x7, 0x37, 0x68, 0x87, 0x72, 0x68, + 0x7, 0x37, 0x80, 0x87, 0x70, 0x90, 0x87, 0x70, + 0x60, 0x7, 0x76, 0x28, 0x7, 0x76, 0xF8, 0x5, 0x76, + 0x78, 0x87, 0x77, 0x80, 0x87, 0x5F, 0x8, 0x87, + 0x71, 0x18, 0x87, 0x72, 0x98, 0x87, 0x79, 0x98, + 0x81, 0x2C, 0xEE, 0xF0, 0xE, 0xEE, 0xE0, 0xE, + 0xF5, 0xC0, 0xE, 0xEC, 0x30, 0x3, 0x62, 0xC8, + 0xA1, 0x1C, 0xE4, 0xA1, 0x1C, 0xCC, 0xA1, 0x1C, + 0xE4, 0xA1, 0x1C, 0xDC, 0x61, 0x1C, 0xCA, 0x21, + 0x1C, 0xC4, 0x81, 0x1D, 0xCA, 0x61, 0x6, 0xD6, + 0x90, 0x43, 0x39, 0xC8, 0x43, 0x39, 0x98, 0x43, + 0x39, 0xC8, 0x43, 0x39, 0xB8, 0xC3, 0x38, 0x94, + 0x43, 0x38, 0x88, 0x3, 0x3B, 0x94, 0xC3, 0x2F, + 0xBC, 0x83, 0x3C, 0xFC, 0x82, 0x3B, 0xD4, 0x3, + 0x3B, 0xB0, 0xC3, 0xC, 0xC7, 0x69, 0x87, 0x70, + 0x58, 0x87, 0x72, 0x70, 0x83, 0x74, 0x68, 0x7, + 0x78, 0x60, 0x87, 0x74, 0x18, 0x87, 0x74, 0xA0, + 0x87, 0x19, 0xCE, 0x53, 0xF, 0xEE, 0x0, 0xF, 0xF2, + 0x50, 0xE, 0xE4, 0x90, 0xE, 0xE3, 0x40, 0xF, 0xE1, + 0x20, 0xE, 0xEC, 0x50, 0xE, 0x33, 0x20, 0x28, + 0x1D, 0xDC, 0xC1, 0x1E, 0xC2, 0x41, 0x1E, 0xD2, + 0x21, 0x1C, 0xDC, 0x81, 0x1E, 0xDC, 0xE0, 0x1C, + 0xE4, 0xE1, 0x1D, 0xEA, 0x1, 0x1E, 0x66, 0x18, + 0x51, 0x38, 0xB0, 0x43, 0x3A, 0x9C, 0x83, 0x3B, + 0xCC, 0x50, 0x24, 0x76, 0x60, 0x7, 0x7B, 0x68, + 0x7, 0x37, 0x60, 0x87, 0x77, 0x78, 0x7, 0x78, + 0x98, 0x51, 0x4C, 0xF4, 0x90, 0xF, 0xF0, 0x50, + 0xE, 0x33, 0x1E, 0x6A, 0x1E, 0xCA, 0x61, 0x1C, + 0xE8, 0x21, 0x1D, 0xDE, 0xC1, 0x1D, 0x7E, 0x1, + 0x1E, 0xE4, 0xA1, 0x1C, 0xCC, 0x21, 0x1D, 0xF0, + 0x61, 0x6, 0x54, 0x85, 0x83, 0x38, 0xCC, 0xC3, + 0x3B, 0xB0, 0x43, 0x3D, 0xD0, 0x43, 0x39, 0xFC, + 0xC2, 0x3C, 0xE4, 0x43, 0x3B, 0x88, 0xC3, 0x3B, + 0xB0, 0xC3, 0x8C, 0xC5, 0xA, 0x87, 0x79, 0x98, + 0x87, 0x77, 0x18, 0x87, 0x74, 0x8, 0x7, 0x7A, + 0x28, 0x7, 0x72, 0x98, 0x81, 0x5C, 0xE3, 0x10, + 0xE, 0xEC, 0xC0, 0xE, 0xE5, 0x50, 0xE, 0xF3, 0x30, + 0x23, 0xC1, 0xD2, 0x41, 0x1E, 0xE4, 0xE1, 0x17, + 0xD8, 0xE1, 0x1D, 0xDE, 0x1, 0x1E, 0x66, 0x48, + 0x19, 0x3B, 0xB0, 0x83, 0x3D, 0xB4, 0x83, 0x1B, + 0x84, 0xC3, 0x38, 0x8C, 0x43, 0x39, 0xCC, 0xC3, + 0x3C, 0xB8, 0xC1, 0x39, 0xC8, 0xC3, 0x3B, 0xD4, + 0x3, 0x3C, 0xCC, 0x48, 0xB4, 0x71, 0x8, 0x7, 0x76, + 0x60, 0x7, 0x71, 0x8, 0x87, 0x71, 0x58, 0x87, + 0x19, 0xDB, 0xC6, 0xE, 0xEC, 0x60, 0xF, 0xED, + 0xE0, 0x6, 0xF0, 0x20, 0xF, 0xE5, 0x30, 0xF, 0xE5, + 0x20, 0xF, 0xF6, 0x50, 0xE, 0x6E, 0x10, 0xE, 0xE3, + 0x30, 0xE, 0xE5, 0x30, 0xF, 0xF3, 0xE0, 0x6, 0xE9, + 0xE0, 0xE, 0xE4, 0x50, 0xE, 0xF8, 0x30, 0x23, + 0xE2, 0xEC, 0x61, 0x1C, 0xC2, 0x81, 0x1D, 0xD8, + 0xE1, 0x17, 0xEC, 0x21, 0x1D, 0xE6, 0x21, 0x1D, + 0xC4, 0x21, 0x1D, 0xD8, 0x21, 0x1D, 0xE8, 0x21, + 0x1F, 0x66, 0x20, 0x9D, 0x3B, 0xBC, 0x43, 0x3D, + 0xB8, 0x3, 0x39, 0x94, 0x83, 0x39, 0xCC, 0x58, + 0xBC, 0x70, 0x70, 0x7, 0x77, 0x78, 0x7, 0x7A, + 0x8, 0x7, 0x7A, 0x48, 0x87, 0x77, 0x70, 0x87, + 0x19, 0xCB, 0xE7, 0xE, 0xEF, 0x30, 0xF, 0xE1, + 0xE0, 0xE, 0xE9, 0x40, 0xF, 0xE9, 0xA0, 0xF, 0xE5, + 0x30, 0xC3, 0x1, 0x3, 0x73, 0xA8, 0x7, 0x77, 0x18, + 0x87, 0x5F, 0x98, 0x87, 0x70, 0x70, 0x87, 0x74, + 0xA0, 0x87, 0x74, 0xD0, 0x87, 0x72, 0x98, 0x81, + 0x84, 0x41, 0x39, 0xE0, 0xC3, 0x38, 0xB0, 0x43, + 0x3D, 0x90, 0x43, 0x39, 0xCC, 0x40, 0xC4, 0xA0, + 0x1D, 0xCA, 0xA1, 0x1D, 0xE0, 0x41, 0x1E, 0xDE, + 0xC1, 0x1C, 0x66, 0x24, 0x63, 0x30, 
0xE, 0xE1, + 0xC0, 0xE, 0xEC, 0x30, 0xF, 0xE9, 0x40, 0xF, 0xE5, + 0x30, 0x43, 0x21, 0x83, 0x75, 0x18, 0x7, 0x73, + 0x48, 0x87, 0x5F, 0xA0, 0x87, 0x7C, 0x80, 0x87, + 0x72, 0x98, 0xB1, 0x94, 0x1, 0x3C, 0x8C, 0xC3, + 0x3C, 0x94, 0xC3, 0x38, 0xD0, 0x43, 0x3A, 0xBC, + 0x83, 0x3B, 0xCC, 0xC3, 0x8C, 0xC5, 0xC, 0x48, + 0x21, 0x15, 0x42, 0x61, 0x1E, 0xE6, 0x21, 0x1D, + 0xCE, 0xC1, 0x1D, 0x52, 0x81, 0x14, 0x66, 0x4C, + 0x67, 0x30, 0xE, 0xEF, 0x20, 0xF, 0xEF, 0xE0, + 0x6, 0xEF, 0x50, 0xF, 0xF4, 0x30, 0xF, 0xE9, 0x40, + 0xE, 0xE5, 0xE0, 0x6, 0xE6, 0x20, 0xF, 0xE1, 0xD0, + 0xE, 0xE5, 0x30, 0xA3, 0x40, 0x83, 0x76, 0x68, + 0x7, 0x79, 0x8, 0x87, 0x19, 0x52, 0x1A, 0xB8, + 0xC3, 0x3B, 0x84, 0x3, 0x3B, 0xA4, 0x43, 0x38, + 0xCC, 0x83, 0x1B, 0x84, 0x3, 0x39, 0x90, 0x83, + 0x3C, 0xCC, 0x3, 0x3C, 0x84, 0xC3, 0x38, 0x94, + 0x3, 0x0, 0x0, 0x0, 0x0, 0x79, 0x28, 0x0, 0x0, + 0x2A, 0x0, 0x0, 0x0, 0xC2, 0x3C, 0x90, 0x40, 0x86, + 0x10, 0x19, 0x32, 0xE2, 0x64, 0x90, 0x40, 0x46, + 0x2, 0x19, 0x23, 0x23, 0x46, 0x2, 0x13, 0x24, + 0xC6, 0x0, 0x13, 0x74, 0x12, 0xA9, 0xB7, 0x37, + 0x3A, 0x23, 0xB6, 0xB0, 0xB3, 0xB9, 0x23, 0x8C, + 0xCD, 0x1D, 0xA2, 0x2D, 0x2C, 0xCD, 0x6D, 0x8, + 0x42, 0x1, 0xC, 0x41, 0x38, 0x82, 0x21, 0x8, 0x87, + 0x30, 0x4, 0xE1, 0x18, 0x86, 0x20, 0x1C, 0xC4, + 0x18, 0x84, 0xA0, 0x18, 0x43, 0x90, 0x8C, 0x41, + 0x20, 0x94, 0x31, 0xC, 0x82, 0x71, 0x8C, 0x41, + 0x28, 0x8E, 0x31, 0xC, 0x45, 0x51, 0x8C, 0x41, + 0x40, 0x9C, 0x31, 0x14, 0xC4, 0x0, 0x0, 0x8F, + 0x89, 0xC8, 0xF0, 0x5C, 0xE4, 0xDE, 0xDE, 0xE8, + 0xE6, 0xD2, 0xCE, 0xDC, 0xC2, 0xE8, 0xEA, 0xE4, + 0xCA, 0xE6, 0x86, 0x12, 0x28, 0xC6, 0x21, 0xC3, + 0x73, 0x99, 0x43, 0xB, 0x23, 0x2B, 0x93, 0x6B, + 0x7A, 0x23, 0x2B, 0x63, 0x1B, 0x4A, 0xB0, 0x18, + 0x85, 0xC, 0xCF, 0xC5, 0xAE, 0x4C, 0x6E, 0x2E, + 0xED, 0xCD, 0x6D, 0x28, 0x1, 0x63, 0x1C, 0x32, + 0x3C, 0x97, 0x32, 0x37, 0x3A, 0xB9, 0x3C, 0xA8, + 0xB7, 0x34, 0x37, 0xBA, 0xB9, 0xA1, 0x4, 0xF, + 0x0, 0x0, 0x71, 0x20, 0x0, 0x0, 0x2, 0x0, 0x0, + 0x0, 0x6, 0x40, 0x30, 0x0, 0xD2, 0x0, 0x0, 0x0, + 0x61, 0x20, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x13, + 0x4, 0x1, 0x86, 0x3, 0x1, 0x0, 0x0, 0x2, 0x0, + 0x0, 0x0, 0x7, 0x50, 0x10, 0xCD, 0x14, 0x61, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + - Name: RTS0 + Size: 8 + RootSignature: + Size: 8 + AllowInputAssemblerInputLayout: true + +# CHECK: !dx.rootsignatures = !{[[RS:![0-9]+]]} +# CHECK: [[RS]] = !{void ()* @main, [[REL:![0-9]+]]} +# CHECK: [[REL]] = !{[[RF:![0-9]+]]} +# CHECK: [[RF]] = !{!"RootFlags", i32 1} From b1423eb7b69a26aaae5145876a465b2f0d9fbe79 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 3 Feb 2025 21:05:40 +0000 Subject: [PATCH 144/220] addressing test concerns --- .../llvm/MC/DXContainerRootSignature.h | 8 +- llvm/include/llvm/Object/DXContainer.h | 14 +- .../include/llvm/ObjectYAML/DXContainerYAML.h | 6 +- llvm/lib/MC/DXContainerRootSignature.cpp | 9 +- llvm/lib/Object/DXContainer.cpp | 18 +- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 11 +- .../DXContainer/RootSignature-Flags.yaml | 22 +- llvm/test/tools/dxil-dis/root-signature.yaml | 201 ------------------ llvm/unittests/Object/DXContainerTest.cpp | 83 ++------ .../ObjectYAML/DXContainerYAMLTest.cpp | 5 +- 10 files changed, 88 insertions(+), 289 deletions(-) delete mode 100644 llvm/test/tools/dxil-dis/root-signature.yaml diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 20b4f5a4285f6..e1a9be5fc52d8 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -15,9 +15,13 @@ 
class raw_ostream; namespace mcdxbc { struct RootSignatureHeader { - uint32_t Flags; + uint32_t Version = 2; + uint32_t NumParameters = 0; + uint32_t RootParametersOffset = 0; + uint32_t NumStaticSamplers = 0; + uint32_t StaticSamplersOffset = 0; + uint32_t Flags = 0; - void swapBytes(); void write(raw_ostream &OS); }; } // namespace mcdxbc diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 5f7737d2fa41d..47128f94e0968 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -120,16 +120,22 @@ namespace DirectX { class RootSignature { private: StringRef Data; - uint32_t Size; + uint32_t Version; + uint32_t NumParameters; + uint32_t RootParametersOffset; + uint32_t NumStaticSamplers; + uint32_t StaticSamplersOffset; uint32_t Flags; public: RootSignature(StringRef Data) : Data(Data) {} Error parse(); - - uint32_t getSize() const { return Size; } - + uint32_t getVersion() const { return Version; } + uint32_t getNumParameters() const { return NumParameters; } + uint32_t getRootParametersOffset() const { return RootParametersOffset; } + uint32_t getNumStaticSamplers() const { return NumStaticSamplers; } + uint32_t getStaticSamplersOffset() const { return StaticSamplersOffset; } uint32_t getFlags() const { return Flags; } }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index a82083fa18de6..1f967114ea1eb 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -79,7 +79,11 @@ struct RootSignatureDesc { RootSignatureDesc(const object::DirectX::RootSignature &Data); uint32_t getEncodedFlags(); - uint32_t Size; + uint32_t Version; + uint32_t NumParameters; + uint32_t RootParametersOffset; + uint32_t NumStaticSamplers; + uint32_t StaticSamplersOffset; #include "llvm/BinaryFormat/DXContainerConstants.def" }; diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 4e085654a1e5e..000d23f24d241 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -8,15 +8,16 @@ #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/SwapByteOrder.h" -#include using namespace llvm; using namespace llvm::mcdxbc; void RootSignatureHeader::write(raw_ostream &OS) { - uint32_t SizeInfo = sizeof(this); - support::endian::write(OS, SizeInfo, llvm::endianness::little); + support::endian::write(OS, Version, llvm::endianness::little); + support::endian::write(OS, NumParameters, llvm::endianness::little); + support::endian::write(OS, RootParametersOffset, llvm::endianness::little); + support::endian::write(OS, NumStaticSamplers, llvm::endianness::little); + support::endian::write(OS, StaticSamplersOffset, llvm::endianness::little); support::endian::write(OS, Flags, llvm::endianness::little); } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index a6b5346601a3d..2730ac43ca4ea 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -249,7 +249,23 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { Error DirectX::RootSignature::parse() { const char *Current = Data.begin(); - Size = support::endian::read(Current); + Version = support::endian::read(Current); + Current += sizeof(uint32_t); + + NumParameters = + support::endian::read(Current); + Current += sizeof(uint32_t); + + 
RootParametersOffset = + support::endian::read(Current); + Current += sizeof(uint32_t); + + NumStaticSamplers = + support::endian::read(Current); + Current += sizeof(uint32_t); + + StaticSamplersOffset = + support::endian::read(Current); Current += sizeof(uint32_t); Flags = support::endian::read(Current); diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index fd85d75dc32eb..522781c0d36ef 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -32,7 +32,10 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { DXContainerYAML::RootSignatureDesc::RootSignatureDesc( const object::DirectX::RootSignature &Data) - : Size(Data.getSize()) { + : Version(Data.getVersion()), NumParameters(Data.getNumParameters()), + RootParametersOffset(Data.getRootParametersOffset()), + NumStaticSamplers(Data.getNumStaticSamplers()), + StaticSamplersOffset(Data.getStaticSamplersOffset()) { uint32_t Flags = Data.getFlags(); #define ROOT_ELEMENT_FLAG(Num, Val, Str) \ Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; @@ -209,7 +212,11 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureDesc &S) { - IO.mapRequired("Size", S.Size); + IO.mapRequired("Version", S.Version); + IO.mapRequired("NumParameters", S.NumParameters); + IO.mapRequired("RootParametersOffset", S.RootParametersOffset); + IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); + IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); #define ROOT_ELEMENT_FLAG(Num, Val, Str) IO.mapOptional(#Val, S.Val, false); #include "llvm/BinaryFormat/DXContainerConstants.def" } diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index e3ca7347d52c7..06814f660f283 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -11,13 +11,21 @@ Header: PartOffsets: [ 60 ] Parts: - Name: RTS0 - Size: 8 + Size: 24 RootSignature: - Size: 8 + Version: 2 + NumParameters: 0 + RootParametersOffset: 0 + NumStaticSamplers: 0 + StaticSamplersOffset: 0 AllowInputAssemblerInputLayout: true -#CHECK: - Name: RTS0 -#CHECK-NEXT: Size: 8 -#CHECK-NEXT: RootSignature: -#CHECK-NEXT: Size: 8 -#CHECK-NEXT: AllowInputAssemblerInputLayout: true +# CHECK: - Name: RTS0 +# CHECK-NEXT: Size: 24 +# CHECK-NEXT: RootSignature: +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: NumParameters: 0 +# CHECK-NEXT: RootParametersOffset: 0 +# CHECK-NEXT: NumStaticSamplers: 0 +# CHECK-NEXT: StaticSamplersOffset: 0 +# CHECK-NEXT: AllowInputAssemblerInputLayout: true diff --git a/llvm/test/tools/dxil-dis/root-signature.yaml b/llvm/test/tools/dxil-dis/root-signature.yaml deleted file mode 100644 index 2a11dd9b3fcee..0000000000000 --- a/llvm/test/tools/dxil-dis/root-signature.yaml +++ /dev/null @@ -1,201 +0,0 @@ -# RUN: yaml2obj %s | dxil-dis | FileCheck %s - ---- !dxcontainer -Header: - Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] - Version: - Major: 1 - Minor: 0 - PartCount: 2 - PartOffsets: [ 60, 1496 ] -Parts: - - Name: DXIL - Size: 1428 - Program: - MajorVersion: 6 - MinorVersion: 0 - ShaderKind: 5 - Size: 357 - DXILMajorVersion: 1 - DXILMinorVersion: 0 - DXILSize: 1404 - DXIL: [ 0x42, 0x43, 0xC0, 0xDE, 0x21, 0xC, 0x0, 0x0, 0x5C, - 0x1, 0x0, 0x0, 0xB, 0x82, 0x20, 0x0, 0x2, 0x0, - 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x7, 0x81, 0x23, - 0x91, 0x41, 
0xC8, 0x4, 0x49, 0x6, 0x10, 0x32, - 0x39, 0x92, 0x1, 0x84, 0xC, 0x25, 0x5, 0x8, 0x19, - 0x1E, 0x4, 0x8B, 0x62, 0x80, 0x10, 0x45, 0x2, - 0x42, 0x92, 0xB, 0x42, 0x84, 0x10, 0x32, 0x14, - 0x38, 0x8, 0x18, 0x4B, 0xA, 0x32, 0x42, 0x88, - 0x48, 0x90, 0x14, 0x20, 0x43, 0x46, 0x88, 0xA5, - 0x0, 0x19, 0x32, 0x42, 0xE4, 0x48, 0xE, 0x90, - 0x11, 0x22, 0xC4, 0x50, 0x41, 0x51, 0x81, 0x8C, - 0xE1, 0x83, 0xE5, 0x8A, 0x4, 0x21, 0x46, 0x6, - 0x89, 0x20, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0, 0x32, - 0x22, 0x8, 0x9, 0x20, 0x64, 0x85, 0x4, 0x13, 0x22, - 0xA4, 0x84, 0x4, 0x13, 0x22, 0xE3, 0x84, 0xA1, - 0x90, 0x14, 0x12, 0x4C, 0x88, 0x8C, 0xB, 0x84, - 0x84, 0x4C, 0x10, 0x20, 0x73, 0x4, 0x8, 0xC1, - 0x65, 0xC3, 0x85, 0x2C, 0xE8, 0x3, 0x40, 0x14, - 0x91, 0x4E, 0xD1, 0x4A, 0x48, 0x44, 0x54, 0x11, - 0xC3, 0x9, 0x30, 0xC4, 0x18, 0x1, 0x30, 0x2, 0x50, - 0x82, 0x21, 0x1A, 0x8, 0x98, 0x23, 0x0, 0x3, 0x0, - 0x13, 0x14, 0x72, 0xC0, 0x87, 0x74, 0x60, 0x87, - 0x36, 0x68, 0x87, 0x79, 0x68, 0x3, 0x72, 0xC0, - 0x87, 0xD, 0xAE, 0x50, 0xE, 0x6D, 0xD0, 0xE, 0x7A, - 0x50, 0xE, 0x6D, 0x0, 0xF, 0x7A, 0x30, 0x7, 0x72, - 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x71, - 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x90, 0xE, 0x78, - 0xA0, 0x7, 0x78, 0xD0, 0x6, 0xE9, 0x10, 0x7, 0x76, - 0xA0, 0x7, 0x71, 0x60, 0x7, 0x6D, 0x90, 0xE, 0x73, - 0x20, 0x7, 0x7A, 0x30, 0x7, 0x72, 0xD0, 0x6, 0xE9, - 0x60, 0x7, 0x74, 0xA0, 0x7, 0x76, 0x40, 0x7, 0x6D, - 0x60, 0xE, 0x71, 0x60, 0x7, 0x7A, 0x10, 0x7, 0x76, - 0xD0, 0x6, 0xE6, 0x30, 0x7, 0x72, 0xA0, 0x7, 0x73, - 0x20, 0x7, 0x6D, 0x60, 0xE, 0x76, 0x40, 0x7, 0x7A, - 0x60, 0x7, 0x74, 0xD0, 0x6, 0xEE, 0x80, 0x7, 0x7A, - 0x10, 0x7, 0x76, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x7A, - 0x60, 0x7, 0x74, 0x30, 0xE4, 0x21, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0xB, - 0x4, 0x6, 0x0, 0x0, 0x0, 0x32, 0x1E, 0x98, 0xC, - 0x19, 0x11, 0x4C, 0x90, 0x8C, 0x9, 0x26, 0x47, - 0xC6, 0x4, 0x43, 0xBA, 0x12, 0x28, 0x86, 0x11, - 0x80, 0x42, 0x0, 0x0, 0x79, 0x18, 0x0, 0x0, 0xCB, - 0x0, 0x0, 0x0, 0x33, 0x8, 0x80, 0x1C, 0xC4, 0xE1, - 0x1C, 0x66, 0x14, 0x1, 0x3D, 0x88, 0x43, 0x38, - 0x84, 0xC3, 0x8C, 0x42, 0x80, 0x7, 0x79, 0x78, - 0x7, 0x73, 0x98, 0x71, 0xC, 0xE6, 0x0, 0xF, 0xED, - 0x10, 0xE, 0xF4, 0x80, 0xE, 0x33, 0xC, 0x42, 0x1E, - 0xC2, 0xC1, 0x1D, 0xCE, 0xA1, 0x1C, 0x66, 0x30, - 0x5, 0x3D, 0x88, 0x43, 0x38, 0x84, 0x83, 0x1B, - 0xCC, 0x3, 0x3D, 0xC8, 0x43, 0x3D, 0x8C, 0x3, - 0x3D, 0xCC, 0x78, 0x8C, 0x74, 0x70, 0x7, 0x7B, - 0x8, 0x7, 0x79, 0x48, 0x87, 0x70, 0x70, 0x7, 0x7A, - 0x70, 0x3, 0x76, 0x78, 0x87, 0x70, 0x20, 0x87, - 0x19, 0xCC, 0x11, 0xE, 0xEC, 0x90, 0xE, 0xE1, - 0x30, 0xF, 0x6E, 0x30, 0xF, 0xE3, 0xF0, 0xE, 0xF0, - 0x50, 0xE, 0x33, 0x10, 0xC4, 0x1D, 0xDE, 0x21, - 0x1C, 0xD8, 0x21, 0x1D, 0xC2, 0x61, 0x1E, 0x66, - 0x30, 0x89, 0x3B, 0xBC, 0x83, 0x3B, 0xD0, 0x43, - 0x39, 0xB4, 0x3, 0x3C, 0xBC, 0x83, 0x3C, 0x84, - 0x3, 0x3B, 0xCC, 0xF0, 0x14, 0x76, 0x60, 0x7, - 0x7B, 0x68, 0x7, 0x37, 0x68, 0x87, 0x72, 0x68, - 0x7, 0x37, 0x80, 0x87, 0x70, 0x90, 0x87, 0x70, - 0x60, 0x7, 0x76, 0x28, 0x7, 0x76, 0xF8, 0x5, 0x76, - 0x78, 0x87, 0x77, 0x80, 0x87, 0x5F, 0x8, 0x87, - 0x71, 0x18, 0x87, 0x72, 0x98, 0x87, 0x79, 0x98, - 0x81, 0x2C, 0xEE, 0xF0, 0xE, 0xEE, 0xE0, 0xE, - 0xF5, 0xC0, 0xE, 0xEC, 0x30, 0x3, 0x62, 0xC8, - 0xA1, 0x1C, 0xE4, 0xA1, 0x1C, 0xCC, 0xA1, 0x1C, - 0xE4, 0xA1, 0x1C, 0xDC, 0x61, 0x1C, 0xCA, 0x21, - 0x1C, 0xC4, 0x81, 0x1D, 0xCA, 0x61, 0x6, 0xD6, - 0x90, 0x43, 0x39, 0xC8, 0x43, 0x39, 0x98, 0x43, - 0x39, 0xC8, 0x43, 0x39, 0xB8, 0xC3, 0x38, 0x94, - 0x43, 0x38, 0x88, 0x3, 0x3B, 0x94, 0xC3, 0x2F, - 0xBC, 0x83, 0x3C, 0xFC, 0x82, 0x3B, 
0xD4, 0x3, - 0x3B, 0xB0, 0xC3, 0xC, 0xC7, 0x69, 0x87, 0x70, - 0x58, 0x87, 0x72, 0x70, 0x83, 0x74, 0x68, 0x7, - 0x78, 0x60, 0x87, 0x74, 0x18, 0x87, 0x74, 0xA0, - 0x87, 0x19, 0xCE, 0x53, 0xF, 0xEE, 0x0, 0xF, 0xF2, - 0x50, 0xE, 0xE4, 0x90, 0xE, 0xE3, 0x40, 0xF, 0xE1, - 0x20, 0xE, 0xEC, 0x50, 0xE, 0x33, 0x20, 0x28, - 0x1D, 0xDC, 0xC1, 0x1E, 0xC2, 0x41, 0x1E, 0xD2, - 0x21, 0x1C, 0xDC, 0x81, 0x1E, 0xDC, 0xE0, 0x1C, - 0xE4, 0xE1, 0x1D, 0xEA, 0x1, 0x1E, 0x66, 0x18, - 0x51, 0x38, 0xB0, 0x43, 0x3A, 0x9C, 0x83, 0x3B, - 0xCC, 0x50, 0x24, 0x76, 0x60, 0x7, 0x7B, 0x68, - 0x7, 0x37, 0x60, 0x87, 0x77, 0x78, 0x7, 0x78, - 0x98, 0x51, 0x4C, 0xF4, 0x90, 0xF, 0xF0, 0x50, - 0xE, 0x33, 0x1E, 0x6A, 0x1E, 0xCA, 0x61, 0x1C, - 0xE8, 0x21, 0x1D, 0xDE, 0xC1, 0x1D, 0x7E, 0x1, - 0x1E, 0xE4, 0xA1, 0x1C, 0xCC, 0x21, 0x1D, 0xF0, - 0x61, 0x6, 0x54, 0x85, 0x83, 0x38, 0xCC, 0xC3, - 0x3B, 0xB0, 0x43, 0x3D, 0xD0, 0x43, 0x39, 0xFC, - 0xC2, 0x3C, 0xE4, 0x43, 0x3B, 0x88, 0xC3, 0x3B, - 0xB0, 0xC3, 0x8C, 0xC5, 0xA, 0x87, 0x79, 0x98, - 0x87, 0x77, 0x18, 0x87, 0x74, 0x8, 0x7, 0x7A, - 0x28, 0x7, 0x72, 0x98, 0x81, 0x5C, 0xE3, 0x10, - 0xE, 0xEC, 0xC0, 0xE, 0xE5, 0x50, 0xE, 0xF3, 0x30, - 0x23, 0xC1, 0xD2, 0x41, 0x1E, 0xE4, 0xE1, 0x17, - 0xD8, 0xE1, 0x1D, 0xDE, 0x1, 0x1E, 0x66, 0x48, - 0x19, 0x3B, 0xB0, 0x83, 0x3D, 0xB4, 0x83, 0x1B, - 0x84, 0xC3, 0x38, 0x8C, 0x43, 0x39, 0xCC, 0xC3, - 0x3C, 0xB8, 0xC1, 0x39, 0xC8, 0xC3, 0x3B, 0xD4, - 0x3, 0x3C, 0xCC, 0x48, 0xB4, 0x71, 0x8, 0x7, 0x76, - 0x60, 0x7, 0x71, 0x8, 0x87, 0x71, 0x58, 0x87, - 0x19, 0xDB, 0xC6, 0xE, 0xEC, 0x60, 0xF, 0xED, - 0xE0, 0x6, 0xF0, 0x20, 0xF, 0xE5, 0x30, 0xF, 0xE5, - 0x20, 0xF, 0xF6, 0x50, 0xE, 0x6E, 0x10, 0xE, 0xE3, - 0x30, 0xE, 0xE5, 0x30, 0xF, 0xF3, 0xE0, 0x6, 0xE9, - 0xE0, 0xE, 0xE4, 0x50, 0xE, 0xF8, 0x30, 0x23, - 0xE2, 0xEC, 0x61, 0x1C, 0xC2, 0x81, 0x1D, 0xD8, - 0xE1, 0x17, 0xEC, 0x21, 0x1D, 0xE6, 0x21, 0x1D, - 0xC4, 0x21, 0x1D, 0xD8, 0x21, 0x1D, 0xE8, 0x21, - 0x1F, 0x66, 0x20, 0x9D, 0x3B, 0xBC, 0x43, 0x3D, - 0xB8, 0x3, 0x39, 0x94, 0x83, 0x39, 0xCC, 0x58, - 0xBC, 0x70, 0x70, 0x7, 0x77, 0x78, 0x7, 0x7A, - 0x8, 0x7, 0x7A, 0x48, 0x87, 0x77, 0x70, 0x87, - 0x19, 0xCB, 0xE7, 0xE, 0xEF, 0x30, 0xF, 0xE1, - 0xE0, 0xE, 0xE9, 0x40, 0xF, 0xE9, 0xA0, 0xF, 0xE5, - 0x30, 0xC3, 0x1, 0x3, 0x73, 0xA8, 0x7, 0x77, 0x18, - 0x87, 0x5F, 0x98, 0x87, 0x70, 0x70, 0x87, 0x74, - 0xA0, 0x87, 0x74, 0xD0, 0x87, 0x72, 0x98, 0x81, - 0x84, 0x41, 0x39, 0xE0, 0xC3, 0x38, 0xB0, 0x43, - 0x3D, 0x90, 0x43, 0x39, 0xCC, 0x40, 0xC4, 0xA0, - 0x1D, 0xCA, 0xA1, 0x1D, 0xE0, 0x41, 0x1E, 0xDE, - 0xC1, 0x1C, 0x66, 0x24, 0x63, 0x30, 0xE, 0xE1, - 0xC0, 0xE, 0xEC, 0x30, 0xF, 0xE9, 0x40, 0xF, 0xE5, - 0x30, 0x43, 0x21, 0x83, 0x75, 0x18, 0x7, 0x73, - 0x48, 0x87, 0x5F, 0xA0, 0x87, 0x7C, 0x80, 0x87, - 0x72, 0x98, 0xB1, 0x94, 0x1, 0x3C, 0x8C, 0xC3, - 0x3C, 0x94, 0xC3, 0x38, 0xD0, 0x43, 0x3A, 0xBC, - 0x83, 0x3B, 0xCC, 0xC3, 0x8C, 0xC5, 0xC, 0x48, - 0x21, 0x15, 0x42, 0x61, 0x1E, 0xE6, 0x21, 0x1D, - 0xCE, 0xC1, 0x1D, 0x52, 0x81, 0x14, 0x66, 0x4C, - 0x67, 0x30, 0xE, 0xEF, 0x20, 0xF, 0xEF, 0xE0, - 0x6, 0xEF, 0x50, 0xF, 0xF4, 0x30, 0xF, 0xE9, 0x40, - 0xE, 0xE5, 0xE0, 0x6, 0xE6, 0x20, 0xF, 0xE1, 0xD0, - 0xE, 0xE5, 0x30, 0xA3, 0x40, 0x83, 0x76, 0x68, - 0x7, 0x79, 0x8, 0x87, 0x19, 0x52, 0x1A, 0xB8, - 0xC3, 0x3B, 0x84, 0x3, 0x3B, 0xA4, 0x43, 0x38, - 0xCC, 0x83, 0x1B, 0x84, 0x3, 0x39, 0x90, 0x83, - 0x3C, 0xCC, 0x3, 0x3C, 0x84, 0xC3, 0x38, 0x94, - 0x3, 0x0, 0x0, 0x0, 0x0, 0x79, 0x28, 0x0, 0x0, - 0x2A, 0x0, 0x0, 0x0, 0xC2, 0x3C, 0x90, 0x40, 0x86, - 0x10, 0x19, 0x32, 0xE2, 0x64, 0x90, 0x40, 0x46, - 0x2, 0x19, 0x23, 0x23, 0x46, 0x2, 0x13, 0x24, 
- 0xC6, 0x0, 0x13, 0x74, 0x12, 0xA9, 0xB7, 0x37, - 0x3A, 0x23, 0xB6, 0xB0, 0xB3, 0xB9, 0x23, 0x8C, - 0xCD, 0x1D, 0xA2, 0x2D, 0x2C, 0xCD, 0x6D, 0x8, - 0x42, 0x1, 0xC, 0x41, 0x38, 0x82, 0x21, 0x8, 0x87, - 0x30, 0x4, 0xE1, 0x18, 0x86, 0x20, 0x1C, 0xC4, - 0x18, 0x84, 0xA0, 0x18, 0x43, 0x90, 0x8C, 0x41, - 0x20, 0x94, 0x31, 0xC, 0x82, 0x71, 0x8C, 0x41, - 0x28, 0x8E, 0x31, 0xC, 0x45, 0x51, 0x8C, 0x41, - 0x40, 0x9C, 0x31, 0x14, 0xC4, 0x0, 0x0, 0x8F, - 0x89, 0xC8, 0xF0, 0x5C, 0xE4, 0xDE, 0xDE, 0xE8, - 0xE6, 0xD2, 0xCE, 0xDC, 0xC2, 0xE8, 0xEA, 0xE4, - 0xCA, 0xE6, 0x86, 0x12, 0x28, 0xC6, 0x21, 0xC3, - 0x73, 0x99, 0x43, 0xB, 0x23, 0x2B, 0x93, 0x6B, - 0x7A, 0x23, 0x2B, 0x63, 0x1B, 0x4A, 0xB0, 0x18, - 0x85, 0xC, 0xCF, 0xC5, 0xAE, 0x4C, 0x6E, 0x2E, - 0xED, 0xCD, 0x6D, 0x28, 0x1, 0x63, 0x1C, 0x32, - 0x3C, 0x97, 0x32, 0x37, 0x3A, 0xB9, 0x3C, 0xA8, - 0xB7, 0x34, 0x37, 0xBA, 0xB9, 0xA1, 0x4, 0xF, - 0x0, 0x0, 0x71, 0x20, 0x0, 0x0, 0x2, 0x0, 0x0, - 0x0, 0x6, 0x40, 0x30, 0x0, 0xD2, 0x0, 0x0, 0x0, - 0x61, 0x20, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x13, - 0x4, 0x1, 0x86, 0x3, 0x1, 0x0, 0x0, 0x2, 0x0, - 0x0, 0x0, 0x7, 0x50, 0x10, 0xCD, 0x14, 0x61, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] - - Name: RTS0 - Size: 8 - RootSignature: - Size: 8 - AllowInputAssemblerInputLayout: true - -# CHECK: !dx.rootsignatures = !{[[RS:![0-9]+]]} -# CHECK: [[RS]] = !{void ()* @main, [[REL:![0-9]+]]} -# CHECK: [[REL]] = !{[[RF:![0-9]+]]} -# CHECK: [[RF]] = !{!"RootFlags", i32 1} diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 5a73f32ab7c32..f80828f06bdd2 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -823,70 +823,23 @@ TEST(DXCFile, MalformedSignature) { } TEST(RootSignature, ParseRootFlags) { - { - uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - }; - DXContainer C = - llvm::cantFail(DXContainer::create(getMemoryBuffer<68>(Buffer))); - - const auto &RS = C.getRootSignature(); - ASSERT_TRUE(RS.has_value()); - ASSERT_EQ(RS->getVersion(), 2u); - ASSERT_EQ(RS->getNumParameters(), 0u); - ASSERT_EQ(RS->getRootParametersOffset(), 0u); - ASSERT_EQ(RS->getNumStaticSamplers(), 0u); - ASSERT_EQ(RS->getStaticSamplersOffset(), 0u); - ASSERT_EQ(RS->getFlags(), 0x01u); - } + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + }; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<180>(Buffer))); - { - // this parameter has the root signature definition missing some values. 
- uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, - 0x6F, 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, - 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, - 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - }; - EXPECT_THAT_EXPECTED( - DXContainer::create(getMemoryBuffer<64>(Buffer)), - FailedWithMessage( - "Invalid root signature, insufficient space for header.")); - } - { - // Version has been changed to an invalid number. - uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - }; - EXPECT_THAT_EXPECTED( - DXContainer::create(getMemoryBuffer<100>(Buffer)), - FailedWithMessage("unsupported root signature version read: 3")); - } - { - // Flag has been set to an invalid value - uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, - }; - EXPECT_THAT_EXPECTED( - DXContainer::create(getMemoryBuffer<100>(Buffer)), - FailedWithMessage( - "unsupported root signature flag value read: 4278190081")); - } + const auto &RS = C.getRootSignature(); + ASSERT_TRUE(RS.has_value()); + ASSERT_EQ(RS->getVersion(), 2); + ASSERT_EQ(RS->getNumParameters(), 0); + ASSERT_EQ(RS->getRootParametersOffset(), 0); + ASSERT_EQ(RS->getNumStaticSamplers(), 0); + ASSERT_EQ(RS->getStaticSamplersOffset(), 0); + ASSERT_EQ(RS->getFlags(), 0x01); } diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index b48cd9ce53987..b18075bac96c7 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" +#include using namespace llvm; using namespace llvm::object; @@ -128,9 +129,9 @@ TEST(RootSignature, ParseRootFlags) { RootSignature: Version: 2 NumParameters: 0 - RootParametersOffset: 0 + RootParametersOffset: 24 NumStaticSamplers: 0 - StaticSamplersOffset: 0 + StaticSamplersOffset: 24 AllowInputAssemblerInputLayout: true )")); From b6262b675d0150e4fc9f33d324b92935ffd4d4a1 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 3 Feb 2025 21:11:48 +0000 Subject: [PATCH 145/220] clean up --- llvm/test/tools/dxil-dis/lit.local.cfg | 2 +- llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/test/tools/dxil-dis/lit.local.cfg b/llvm/test/tools/dxil-dis/lit.local.cfg index 8fe45f696bff9..7b6819e0b406a 100644 --- a/llvm/test/tools/dxil-dis/lit.local.cfg +++ b/llvm/test/tools/dxil-dis/lit.local.cfg @@ -1,3 +1,3 @@ if not config.dxil_tests: config.unsupported = True 
-config.suffixes = [".ll", ".yaml"] +config.suffixes = [".ll"] diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index b18075bac96c7..eaa8049e8e7d9 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -15,7 +15,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" -#include using namespace llvm; using namespace llvm::object; From 16d0e8ef24d3794270e5107fdd11335b67868497 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 4 Feb 2025 07:20:03 +0000 Subject: [PATCH 146/220] addressing comments --- llvm/include/llvm/BinaryFormat/DXContainer.h | 2 +- .../BinaryFormat/DXContainerConstants.def | 48 +++++-------------- .../include/llvm/ObjectYAML/DXContainerYAML.h | 5 +- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 5 ++ llvm/lib/ObjectYAML/DXContainerYAML.cpp | 6 +-- .../DXContainer/RootSignature-Flags.yaml | 18 +++---- .../ObjectYAML/DXContainerYAMLTest.cpp | 4 +- 7 files changed, 36 insertions(+), 52 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 3f7b50b82c7c8..2b7c79452bc18 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -162,7 +162,7 @@ enum class FeatureFlags : uint64_t { static_assert((uint64_t)FeatureFlags::NextUnusedBit <= 1ull << 63, "Shader flag bits exceed enum size."); -#define ROOT_ELEMENT_FLAG(Num, Val, Str) Val = 1ull << Num, +#define ROOT_ELEMENT_FLAG(Num, Val) Val = 1ull << Num, enum class RootElementFlag : uint32_t { #include "DXContainerConstants.def" }; diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 96d84fdc3faa3..e27f442c22e53 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -53,44 +53,20 @@ SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a #undef SHADER_FEATURE_FLAG #endif // SHADER_FEATURE_FLAG -#ifdef ROOT_PARAMETER - -ROOT_PARAMETER(DescriptorTable) -ROOT_PARAMETER(Constants32Bit) -ROOT_PARAMETER(CBV) -ROOT_PARAMETER(SRV) -ROOT_PARAMETER(UAV) -#undef ROOT_PARAMETER -#endif // ROOT_PARAMETER - - -#ifdef SHADER_VISIBILITY - -SHADER_VISIBILITY(All) -SHADER_VISIBILITY(Vertex) -SHADER_VISIBILITY(Hull) -SHADER_VISIBILITY(Domain) -SHADER_VISIBILITY(Geometry) -SHADER_VISIBILITY(Pixel) -SHADER_VISIBILITY(Amplification) -SHADER_VISIBILITY(Mesh) -#undef SHADER_VISIBILITY -#endif // SHADER_VISIBILITY - #ifdef ROOT_ELEMENT_FLAG -ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout, "The app is opting in to using the Input Assembler") -ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess, "Denies the vertex shader access to the root signature.") -ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess, "Denies the hull shader access to the root signature.") -ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess, "Denies the domain shader access to the root signature.") -ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess, "Denies the geometry shader access to the root signature.") -ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess, "Denies the pixel shader access to the root signature.") -ROOT_ELEMENT_FLAG(6, AllowStreamOutput, "The app is opting in to using Stream Output.") -ROOT_ELEMENT_FLAG(7, LocalRootSignature, "The root signature is to be used with raytracing shaders to define 
resource bindings sourced from shader records in shader tables.") -ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess, "Denies the amplification shader access to the root signature.") -ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess, "Denies the mesh shader access to the root signature.") -ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed, "The shaders are allowed to index the CBV/SRV/UAV descriptor heap directly, using the ResourceDescriptorHeap built-in variable.") -ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed, "The shaders are allowed to index the sampler descriptor heap directly, using the SamplerDescriptorHeap built-in variable.") +ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout) +ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess) +ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess) +ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess) +ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess) +ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess) +ROOT_ELEMENT_FLAG(6, AllowStreamOutput) +ROOT_ELEMENT_FLAG(7, LocalRootSignature) +ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess) +ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess) +ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed) +ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) #undef ROOT_ELEMENT_FLAG #endif // ROOT_ELEMENT_FLAG diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 1f967114ea1eb..0200f5cb196ff 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -73,18 +73,19 @@ struct ShaderHash { std::vector Digest; }; -#define ROOT_ELEMENT_FLAG(Num, Val, Str) bool Val = false; +#define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; struct RootSignatureDesc { RootSignatureDesc() = default; RootSignatureDesc(const object::DirectX::RootSignature &Data); - uint32_t getEncodedFlags(); uint32_t Version; uint32_t NumParameters; uint32_t RootParametersOffset; uint32_t NumStaticSamplers; uint32_t StaticSamplersOffset; + uint32_t getEncodedFlags(); + #include "llvm/BinaryFormat/DXContainerConstants.def" }; diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index ada7383ea3c6b..b7d1c6558fa1f 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -268,6 +268,11 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { mcdxbc::RootSignatureHeader Header; Header.Flags = P.RootSignature->getEncodedFlags(); + Header.Version = P.RootSignature->Version; + Header.NumParameters = P.RootSignature->NumParameters; + Header.RootParametersOffset = P.RootSignature->RootParametersOffset; + Header.NumStaticSamplers = P.RootSignature->NumStaticSamplers; + Header.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; Header.write(OS); break; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 522781c0d36ef..fdf87b05d1f43 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -37,14 +37,14 @@ DXContainerYAML::RootSignatureDesc::RootSignatureDesc( NumStaticSamplers(Data.getNumStaticSamplers()), StaticSamplersOffset(Data.getStaticSamplersOffset()) { uint32_t Flags = Data.getFlags(); -#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ +#define ROOT_ELEMENT_FLAG(Num, Val) \ Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } uint32_t DXContainerYAML::RootSignatureDesc::getEncodedFlags() { uint64_t 
Flag = 0; -#define ROOT_ELEMENT_FLAG(Num, Val, Str) \ +#define ROOT_ELEMENT_FLAG(Num, Val) \ if (Val) \ Flag |= (uint32_t)dxbc::RootElementFlag::Val; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -217,7 +217,7 @@ void MappingTraits::mapping( IO.mapRequired("RootParametersOffset", S.RootParametersOffset); IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); -#define ROOT_ELEMENT_FLAG(Num, Val, Str) IO.mapOptional(#Val, S.Val, false); +#define ROOT_ELEMENT_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false); #include "llvm/BinaryFormat/DXContainerConstants.def" } diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index 06814f660f283..b0a3e6945f454 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -14,18 +14,20 @@ Parts: Size: 24 RootSignature: Version: 2 - NumParameters: 0 - RootParametersOffset: 0 - NumStaticSamplers: 0 - StaticSamplersOffset: 0 + NumParameters: 1 + RootParametersOffset: 3 + NumStaticSamplers: 4 + StaticSamplersOffset: 5 AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true # CHECK: - Name: RTS0 # CHECK-NEXT: Size: 24 # CHECK-NEXT: RootSignature: # CHECK-NEXT: Version: 2 -# CHECK-NEXT: NumParameters: 0 -# CHECK-NEXT: RootParametersOffset: 0 -# CHECK-NEXT: NumStaticSamplers: 0 -# CHECK-NEXT: StaticSamplersOffset: 0 +# CHECK-NEXT: NumParameters: 1 +# CHECK-NEXT: RootParametersOffset: 3 +# CHECK-NEXT: NumStaticSamplers: 4 +# CHECK-NEXT: StaticSamplersOffset: 5 # CHECK-NEXT: AllowInputAssemblerInputLayout: true +# CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index eaa8049e8e7d9..b48cd9ce53987 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -128,9 +128,9 @@ TEST(RootSignature, ParseRootFlags) { RootSignature: Version: 2 NumParameters: 0 - RootParametersOffset: 24 + RootParametersOffset: 0 NumStaticSamplers: 0 - StaticSamplersOffset: 24 + StaticSamplersOffset: 0 AllowInputAssemblerInputLayout: true )")); From 0a9e4689ae2c10ccbdbc36b6fd18d9a172204c4d Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 4 Feb 2025 19:19:36 +0000 Subject: [PATCH 147/220] adding fail test --- llvm/include/llvm/Object/DXContainer.h | 5 +-- llvm/lib/Object/DXContainer.cpp | 12 ++++-- llvm/unittests/Object/DXContainerTest.cpp | 51 +++++++++++++++-------- 3 files changed, 44 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 47128f94e0968..c3a2f756bd683 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -119,7 +119,6 @@ namespace DirectX { class RootSignature { private: - StringRef Data; uint32_t Version; uint32_t NumParameters; uint32_t RootParametersOffset; @@ -128,9 +127,9 @@ class RootSignature { uint32_t Flags; public: - RootSignature(StringRef Data) : Data(Data) {} + RootSignature() {} - Error parse(); + Error parse(StringRef Data); uint32_t getVersion() const { return Version; } uint32_t getNumParameters() const { return NumParameters; } uint32_t getRootParametersOffset() const { return RootParametersOffset; } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 2730ac43ca4ea..dd0a465d7adb7 
100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/DXContainer.h" +#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" @@ -100,8 +101,8 @@ Error DXContainer::parseHash(StringRef Part) { Error DXContainer::parseRootSignature(StringRef Part) { if (RootSignature) return parseFailed("More than one RTS0 part is present in the file"); - RootSignature = DirectX::RootSignature(Part); - if (Error Err = RootSignature->parse()) + RootSignature = DirectX::RootSignature(); + if (Error Err = RootSignature->parse(Part)) return Err; return Error::success(); } @@ -246,9 +247,14 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { IteratorState.Offset = Offset; } -Error DirectX::RootSignature::parse() { +Error DirectX::RootSignature::parse(StringRef Data) { const char *Current = Data.begin(); + // Root Signature headers expects 6 integers to be present. + if (Data.size() < 6 * sizeof(uint32_t)) { + return parseFailed("Invalid data. Too small."); + } + Version = support::endian::read(Current); Current += sizeof(uint32_t); diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index f80828f06bdd2..bff58c036489f 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -823,23 +823,38 @@ TEST(DXCFile, MalformedSignature) { } TEST(RootSignature, ParseRootFlags) { - uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, - 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - }; - DXContainer C = - llvm::cantFail(DXContainer::create(getMemoryBuffer<180>(Buffer))); + { + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + }; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<180>(Buffer))); + + const auto &RS = C.getRootSignature(); + ASSERT_TRUE(RS.has_value()); + ASSERT_EQ(RS->getVersion(), 2); + ASSERT_EQ(RS->getNumParameters(), 0); + ASSERT_EQ(RS->getRootParametersOffset(), 0); + ASSERT_EQ(RS->getNumStaticSamplers(), 0); + ASSERT_EQ(RS->getStaticSamplersOffset(), 0); + ASSERT_EQ(RS->getFlags(), 0x01); + } - const auto &RS = C.getRootSignature(); - ASSERT_TRUE(RS.has_value()); - ASSERT_EQ(RS->getVersion(), 2); - ASSERT_EQ(RS->getNumParameters(), 0); - ASSERT_EQ(RS->getRootParametersOffset(), 0); - ASSERT_EQ(RS->getNumStaticSamplers(), 0); - ASSERT_EQ(RS->getStaticSamplersOffset(), 0); - ASSERT_EQ(RS->getFlags(), 0x01); + { + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, + 0x6F, 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, + 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 
0x00, 0x00, 0x24, + 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<44>(Buffer)), + FailedWithMessage("Invalid data. Too small.")); + } } From 82950318e00053b0367d721988e39e223573a1aa Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 4 Feb 2025 19:26:29 +0000 Subject: [PATCH 148/220] adding comment --- llvm/include/llvm/BinaryFormat/DXContainerConstants.def | 2 ++ llvm/unittests/Object/DXContainerTest.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index e27f442c22e53..6d44ea14df444 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -53,6 +53,8 @@ SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a #undef SHADER_FEATURE_FLAG #endif // SHADER_FEATURE_FLAG + +// ROOT_ELEMENT_FLAG(bit offset for the flag, name). #ifdef ROOT_ELEMENT_FLAG ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout) diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index bff58c036489f..8e20ae552c062 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -846,6 +846,7 @@ TEST(RootSignature, ParseRootFlags) { } { + // this parameter has the root signature definition missing some values. uint8_t Buffer[] = { 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, From c8e1e3889d2f6c61bc5cd532cc06f0f194fed783 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 4 Feb 2025 23:24:59 +0000 Subject: [PATCH 149/220] adding few more tests --- llvm/include/llvm/BinaryFormat/DXContainer.h | 26 ++++++++--------- llvm/include/llvm/Object/DXContainer.h | 1 + llvm/lib/Object/DXContainer.cpp | 17 +++++++++-- llvm/unittests/Object/DXContainerTest.cpp | 30 ++++++++++++++++++-- 4 files changed, 57 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 2b7c79452bc18..4f48d0c41cf76 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -14,9 +14,12 @@ #define LLVM_BINARYFORMAT_DXCONTAINER_H #include "llvm/ADT/StringRef.h" +#include "llvm/Object/Error.h" +#include "llvm/Support/Error.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" +#include #include namespace llvm { @@ -63,15 +66,6 @@ struct ShaderHash { void swapBytes() { sys::swapByteOrder(Flags); } }; -struct RootSignatureDesc { - uint32_t Size; - uint32_t Flags; - - void swapBytes() { - sys::swapByteOrder(Size); - sys::swapByteOrder(Flags); - } -}; struct ContainerVersion { uint16_t Major; @@ -558,11 +552,17 @@ static_assert(sizeof(ProgramSignatureElement) == 32, struct RootSignatureValidations { - static bool isValidRootFlag(uint32_t Flags) { return (Flags & ~0xfff) == 0; } + static Expected validateRootFlag(uint32_t Flags) { + if ((Flags & ~0x80000fff) != 0) + return llvm::make_error("Invalid flag"); + return Flags; + } - static bool isValidVersion(uint32_t Version) { - return (Version == 1 || Version == 2); - } + static Expected validateVersion(uint32_t Version) { + if (Version < 1 || Version > 2) + 
return llvm::make_error("Invalid Version"); + return Version; + } }; } // namespace dxbc diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index c3a2f756bd683..e90c6866400e0 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/Error.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index dd0a465d7adb7..084b634662391 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -12,7 +12,9 @@ #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include using namespace llvm; using namespace llvm::object; @@ -250,14 +252,20 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { Error DirectX::RootSignature::parse(StringRef Data) { const char *Current = Data.begin(); + // Root Signature headers expects 6 integers to be present. if (Data.size() < 6 * sizeof(uint32_t)) { return parseFailed("Invalid data. Too small."); } - Version = support::endian::read(Current); + uint32_t VValue = support::endian::read(Current); Current += sizeof(uint32_t); + Expected MaybeVersion = dxbc::RootSignatureValidations::validateVersion(VValue); + if(Error E = MaybeVersion.takeError()) + return E; + Version = MaybeVersion.get(); + NumParameters = support::endian::read(Current); Current += sizeof(uint32_t); @@ -274,9 +282,14 @@ Error DirectX::RootSignature::parse(StringRef Data) { support::endian::read(Current); Current += sizeof(uint32_t); - Flags = support::endian::read(Current); + uint32_t FValue = support::endian::read(Current); Current += sizeof(uint32_t); + Expected MaybeFlag = dxbc::RootSignatureValidations::validateRootFlag(FValue); + if(Error E = MaybeFlag.takeError()) + return E; + Flags = MaybeFlag.get(); + return Error::success(); } diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 8e20ae552c062..1433d5e7f2f08 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -833,7 +833,7 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; DXContainer C = - llvm::cantFail(DXContainer::create(getMemoryBuffer<180>(Buffer))); + llvm::cantFail(DXContainer::create(getMemoryBuffer<68>(Buffer))); const auto &RS = C.getRootSignature(); ASSERT_TRUE(RS.has_value()); @@ -855,7 +855,33 @@ TEST(RootSignature, ParseRootFlags) { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<44>(Buffer)), + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<64>(Buffer)), FailedWithMessage("Invalid data. Too small.")); } + { + // Version has been changed to an invalid number. 
+ uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + }; + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Invalid Version")); + } + { + // Flag has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, + }; + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Invalid flag")); + } } From 434b862b6c53983bc6f71af693c17d48512bbf4c Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 4 Feb 2025 23:29:02 +0000 Subject: [PATCH 150/220] format --- llvm/include/llvm/BinaryFormat/DXContainer.h | 21 ++++++++++---------- llvm/lib/Object/DXContainer.cpp | 17 +++++++++------- llvm/unittests/Object/DXContainerTest.cpp | 4 ++-- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 4f48d0c41cf76..0d5caabd3471e 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -66,7 +66,6 @@ struct ShaderHash { void swapBytes() { sys::swapByteOrder(Flags); } }; - struct ContainerVersion { uint16_t Major; uint16_t Minor; @@ -552,17 +551,17 @@ static_assert(sizeof(ProgramSignatureElement) == 32, struct RootSignatureValidations { - static Expected validateRootFlag(uint32_t Flags) { - if ((Flags & ~0x80000fff) != 0) - return llvm::make_error("Invalid flag"); - return Flags; - } + static Expected validateRootFlag(uint32_t Flags) { + if ((Flags & ~0x80000fff) != 0) + return llvm::make_error("Invalid flag"); + return Flags; + } - static Expected validateVersion(uint32_t Version) { - if (Version < 1 || Version > 2) - return llvm::make_error("Invalid Version"); - return Version; - } + static Expected validateVersion(uint32_t Version) { + if (Version < 1 || Version > 2) + return llvm::make_error("Invalid Version"); + return Version; + } }; } // namespace dxbc diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 084b634662391..05f3df640835c 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -252,17 +252,18 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { Error DirectX::RootSignature::parse(StringRef Data) { const char *Current = Data.begin(); - // Root Signature headers expects 6 integers to be present. if (Data.size() < 6 * sizeof(uint32_t)) { return parseFailed("Invalid data. 
Too small."); } - uint32_t VValue = support::endian::read(Current); + uint32_t VValue = + support::endian::read(Current); Current += sizeof(uint32_t); - Expected MaybeVersion = dxbc::RootSignatureValidations::validateVersion(VValue); - if(Error E = MaybeVersion.takeError()) + Expected MaybeVersion = + dxbc::RootSignatureValidations::validateVersion(VValue); + if (Error E = MaybeVersion.takeError()) return E; Version = MaybeVersion.get(); @@ -282,11 +283,13 @@ Error DirectX::RootSignature::parse(StringRef Data) { support::endian::read(Current); Current += sizeof(uint32_t); - uint32_t FValue = support::endian::read(Current); + uint32_t FValue = + support::endian::read(Current); Current += sizeof(uint32_t); - Expected MaybeFlag = dxbc::RootSignatureValidations::validateRootFlag(FValue); - if(Error E = MaybeFlag.takeError()) + Expected MaybeFlag = + dxbc::RootSignatureValidations::validateRootFlag(FValue); + if (Error E = MaybeFlag.takeError()) return E; Flags = MaybeFlag.get(); diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 1433d5e7f2f08..8489b05f8b331 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -868,7 +868,7 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), FailedWithMessage("Invalid Version")); } { @@ -881,7 +881,7 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), FailedWithMessage("Invalid flag")); } } From 2bfc5ad0cdef4197b9c05b5a5a8f964335bda7f6 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 5 Feb 2025 20:21:43 +0000 Subject: [PATCH 151/220] cleanup --- llvm/include/llvm/BinaryFormat/DXContainer.h | 7 +++---- llvm/include/llvm/Object/DXContainer.h | 1 - llvm/lib/Object/DXContainer.cpp | 3 --- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 1 - 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 0d5caabd3471e..c219aa819795e 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -14,12 +14,11 @@ #define LLVM_BINARYFORMAT_DXCONTAINER_H #include "llvm/ADT/StringRef.h" -#include "llvm/Object/Error.h" +#include "llvm/Support/BinaryStreamError.h" #include "llvm/Support/Error.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" -#include #include namespace llvm { @@ -553,13 +552,13 @@ struct RootSignatureValidations { static Expected validateRootFlag(uint32_t Flags) { if ((Flags & ~0x80000fff) != 0) - return llvm::make_error("Invalid flag"); + return llvm::make_error("Invalid flag"); return Flags; } static Expected validateVersion(uint32_t Version) { if (Version < 1 || Version > 2) - return llvm::make_error("Invalid Version"); + return llvm::make_error("Invalid Version"); return Version; } }; diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index e90c6866400e0..c3a2f756bd683 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ 
b/llvm/include/llvm/Object/DXContainer.h @@ -18,7 +18,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" -#include "llvm/Object/Error.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 05f3df640835c..4b840bae86601 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -7,14 +7,11 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/DXContainer.h" -#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" -#include using namespace llvm; using namespace llvm::object; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index fdf87b05d1f43..0869fd4fa9785 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -14,7 +14,6 @@ #include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" -#include "llvm/Object/DXContainer.h" #include "llvm/Support/ScopedPrinter.h" namespace llvm { From 479422d3d5bb58f3a31462819771db35fbdae9aa Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 15 Jan 2025 17:30:00 +0000 Subject: [PATCH 152/220] adding metadata extraction --- .../llvm/Analysis/DXILMetadataAnalysis.h | 3 + llvm/lib/Analysis/DXILMetadataAnalysis.cpp | 89 +++++++++++++++++++ .../lib/Target/DirectX/DXContainerGlobals.cpp | 25 ++---- 3 files changed, 100 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h index cb535ac14f1c6..f420244ba111a 100644 --- a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h +++ b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h @@ -11,9 +11,11 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" +#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" #include "llvm/Support/VersionTuple.h" #include "llvm/TargetParser/Triple.h" +#include namespace llvm { @@ -37,6 +39,7 @@ struct ModuleMetadataInfo { Triple::EnvironmentType ShaderProfile{Triple::UnknownEnvironment}; VersionTuple ValidatorVersion{}; SmallVector EntryPropertyVec{}; + std::optional RootSignatureDesc; void print(raw_ostream &OS) const; }; diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp index a7f666a3f8b48..388e3853008ea 100644 --- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp +++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp @@ -15,12 +15,91 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" +#include "llvm/MC/DXContainerRootSignature.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" +#include #define DEBUG_TYPE "dxil-metadata-analysis" using namespace llvm; using namespace dxil; +using namespace llvm::mcdxbc; + +static bool parseRootFlags(MDNode *RootFlagNode, RootSignatureDesc *Desc) { + + assert(RootFlagNode->getNumOperands() == 2 && + "Invalid format for RootFlag Element"); + auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); + auto Value = (RootSignatureFlags)Flag->getZExtValue(); + + if ((Value & ~RootSignatureFlags::ValidFlags) != 
RootSignatureFlags::None) + return true; + + Desc->Flags = Value; + return false; +} + +static bool parseRootSignatureElement(MDNode *Element, + RootSignatureDesc *Desc) { + MDString *ElementText = cast(Element->getOperand(0)); + + assert(ElementText != nullptr && "First preoperty of element is not "); + + RootSignatureElementKind ElementKind = + StringSwitch(ElementText->getString()) + .Case("RootFlags", RootSignatureElementKind::RootFlags) + .Case("RootConstants", RootSignatureElementKind::RootConstants) + .Case("RootCBV", RootSignatureElementKind::RootDescriptor) + .Case("RootSRV", RootSignatureElementKind::RootDescriptor) + .Case("RootUAV", RootSignatureElementKind::RootDescriptor) + .Case("Sampler", RootSignatureElementKind::RootDescriptor) + .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable) + .Case("StaticSampler", RootSignatureElementKind::StaticSampler) + .Default(RootSignatureElementKind::None); + + switch (ElementKind) { + + case RootSignatureElementKind::RootFlags: { + return parseRootFlags(Element, Desc); + break; + } + + case RootSignatureElementKind::RootConstants: + case RootSignatureElementKind::RootDescriptor: + case RootSignatureElementKind::DescriptorTable: + case RootSignatureElementKind::StaticSampler: + case RootSignatureElementKind::None: + llvm_unreachable("Not Implemented yet"); + break; + } + + return true; +} + +bool parseRootSignature(RootSignatureDesc *Desc, int32_t Version, + NamedMDNode *Root) { + Desc->Version = Version; + bool HasError = false; + + for (unsigned int Sid = 0; Sid < Root->getNumOperands(); Sid++) { + // This should be an if, for error handling + MDNode *Node = cast(Root->getOperand(Sid)); + + // Not sure what use this for... + Metadata *Func = Node->getOperand(0).get(); + + // This should be an if, for error handling + MDNode *Elements = cast(Node->getOperand(1).get()); + + for (unsigned int Eid = 0; Eid < Elements->getNumOperands(); Eid++) { + MDNode *Element = cast(Elements->getOperand(Eid)); + + HasError = HasError || parseRootSignatureElement(Element, Desc); + } + } + return HasError; +} static ModuleMetadataInfo collectMetadataInfo(Module &M) { ModuleMetadataInfo MMDAI; @@ -28,6 +107,7 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) { MMDAI.DXILVersion = TT.getDXILVersion(); MMDAI.ShaderModelVersion = TT.getOSVersion(); MMDAI.ShaderProfile = TT.getEnvironment(); + NamedMDNode *ValidatorVerNode = M.getNamedMetadata("dx.valver"); if (ValidatorVerNode) { auto *ValVerMD = cast(ValidatorVerNode->getOperand(0)); @@ -37,6 +117,15 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) { VersionTuple(MajorMD->getZExtValue(), MinorMD->getZExtValue()); } + NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); + if (RootSignatureNode) { + mcdxbc::RootSignatureDesc Desc; + + parseRootSignature(&Desc, 1, RootSignatureNode); + + MMDAI.RootSignatureDesc = Desc; + } + // For all HLSL Shader functions for (auto &F : M.functions()) { if (!F.hasFnAttribute("hlsl.shader")) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 5508af40663b1..e4007bafdd339 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/MC/DXContainerPSVInfo.h" +#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" @@ -153,27 +154,17 @@ 
void DXContainerGlobals::addSignature(Module &M,
 
 void DXContainerGlobals::addRootSignature(Module &M,
                                           SmallVector &Globals) {
-  dxil::ModuleMetadataInfo &MMI =
-      getAnalysis().getModuleMetadata();
-
-  // Root Signature in Library don't compile to DXContainer.
-  if (MMI.ShaderProfile == llvm::Triple::Library)
-    return;
-
-  assert(MMI.EntryPropertyVec.size() == 1);
-
-  auto &RSA = getAnalysis();
-  const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry;
-  const auto &FuncRs = RSA.find(EntryFunction);
-
-  if (FuncRs == RSA.end())
+  std::optional Desc =
+      getAnalysis()
+          .getModuleMetadata()
+          .RootSignatureDesc;
+  if (!Desc.has_value())
     return;
 
-  const RootSignatureDesc &RS = FuncRs->second;
   SmallString<256> Data;
   raw_svector_ostream OS(Data);
-
-  RS.write(OS);
+  RootSignatureDescWriter writer(&Desc.value());
+  writer.write(OS);
 
   Constant *Constant =
       ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false);

From 479422d3d5bb58f3a31462819771db35fbdae9aa Mon Sep 17 00:00:00 2001
From: joaosaffran
Date: Thu, 16 Jan 2025 00:36:11 +0000
Subject: [PATCH 153/220] moving root signature to its own pass

---
 .../llvm/Analysis/DXILMetadataAnalysis.h      |   2 -
 llvm/lib/Analysis/DXILMetadataAnalysis.cpp    |  84 ------
 llvm/lib/MC/CMakeLists.txt                    |   1 -
 llvm/lib/Target/DirectX/CMakeLists.txt        |   1 -
 .../lib/Target/DirectX/DXContainerGlobals.cpp |  14 +-
 llvm/lib/Target/DirectX/DXILRootSignature.cpp | 240 ++++++------------
 llvm/lib/Target/DirectX/DXILRootSignature.h   |  80 +++---
 .../ContainerData/RootSignature-Flags.ll      |  29 ++-
 8 files changed, 138 insertions(+), 313 deletions(-)

diff --git a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
index f420244ba111a..dcc3237f57802 100644
--- a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
+++ b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
@@ -11,7 +11,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/MC/DXContainerRootSignature.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/TargetParser/Triple.h"
@@ -39,7 +38,6 @@ struct ModuleMetadataInfo {
   Triple::EnvironmentType ShaderProfile{Triple::UnknownEnvironment};
   VersionTuple ValidatorVersion{};
   SmallVector EntryPropertyVec{};
-  std::optional RootSignatureDesc;
   void print(raw_ostream &OS) const;
 };
 
diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
index 388e3853008ea..15e72bf17515b 100644
--- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
+++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
@@ -15,7 +15,6 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/MC/DXContainerRootSignature.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include 
@@ -24,82 +23,8 @@
 
 using namespace llvm;
 using namespace dxil;
-using namespace llvm::mcdxbc;
-
-static bool parseRootFlags(MDNode *RootFlagNode, RootSignatureDesc *Desc) {
-
-  assert(RootFlagNode->getNumOperands() == 2 &&
-         "Invalid format for RootFlag Element");
-  auto *Flag = mdconst::extract(RootFlagNode->getOperand(1));
-  auto Value = (RootSignatureFlags)Flag->getZExtValue();
-
-  if ((Value & ~RootSignatureFlags::ValidFlags) != RootSignatureFlags::None)
-    return true;
-
-  Desc->Flags = Value;
-  return false;
-}
-
-static bool parseRootSignatureElement(MDNode *Element,
-                                      RootSignatureDesc *Desc) {
-  MDString *ElementText = cast(Element->getOperand(0));
-
-  assert(ElementText != nullptr && "First preoperty of element 
is not "); - - RootSignatureElementKind ElementKind = - StringSwitch(ElementText->getString()) - .Case("RootFlags", RootSignatureElementKind::RootFlags) - .Case("RootConstants", RootSignatureElementKind::RootConstants) - .Case("RootCBV", RootSignatureElementKind::RootDescriptor) - .Case("RootSRV", RootSignatureElementKind::RootDescriptor) - .Case("RootUAV", RootSignatureElementKind::RootDescriptor) - .Case("Sampler", RootSignatureElementKind::RootDescriptor) - .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable) - .Case("StaticSampler", RootSignatureElementKind::StaticSampler) - .Default(RootSignatureElementKind::None); - - switch (ElementKind) { - - case RootSignatureElementKind::RootFlags: { - return parseRootFlags(Element, Desc); - break; - } - - case RootSignatureElementKind::RootConstants: - case RootSignatureElementKind::RootDescriptor: - case RootSignatureElementKind::DescriptorTable: - case RootSignatureElementKind::StaticSampler: - case RootSignatureElementKind::None: - llvm_unreachable("Not Implemented yet"); - break; - } - - return true; -} - -bool parseRootSignature(RootSignatureDesc *Desc, int32_t Version, - NamedMDNode *Root) { - Desc->Version = Version; - bool HasError = false; - - for (unsigned int Sid = 0; Sid < Root->getNumOperands(); Sid++) { - // This should be an if, for error handling - MDNode *Node = cast(Root->getOperand(Sid)); - - // Not sure what use this for... - Metadata *Func = Node->getOperand(0).get(); - - // This should be an if, for error handling - MDNode *Elements = cast(Node->getOperand(1).get()); - - for (unsigned int Eid = 0; Eid < Elements->getNumOperands(); Eid++) { - MDNode *Element = cast(Elements->getOperand(Eid)); - - HasError = HasError || parseRootSignatureElement(Element, Desc); - } - } - return HasError; -} static ModuleMetadataInfo collectMetadataInfo(Module &M) { ModuleMetadataInfo MMDAI; @@ -117,15 +42,6 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) { VersionTuple(MajorMD->getZExtValue(), MinorMD->getZExtValue()); } - NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode) { - mcdxbc::RootSignatureDesc Desc; - - parseRootSignature(&Desc, 1, RootSignatureNode); - - MMDAI.RootSignatureDesc = Desc; - } - // For all HLSL Shader functions for (auto &F : M.functions()) { if (!F.hasFnAttribute("hlsl.shader")) diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index f49f14c848b90..e1d19196c8766 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -1,7 +1,6 @@ add_llvm_component_library(LLVMMC ConstantPools.cpp DXContainerPSVInfo.cpp - DXContainerRootSignature.cpp ELFObjectWriter.cpp GOFFObjectWriter.cpp MCAsmBackend.cpp diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt index 5a167535b0afa..89fe494dea71c 100644 --- a/llvm/lib/Target/DirectX/CMakeLists.txt +++ b/llvm/lib/Target/DirectX/CMakeLists.txt @@ -34,7 +34,6 @@ add_llvm_target(DirectXCodeGen DXILShaderFlags.cpp DXILTranslateMetadata.cpp DXILRootSignature.cpp - LINK_COMPONENTS Analysis AsmPrinter diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index e4007bafdd339..491f42f0413f2 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -12,6 +12,7 @@ #include "DXILRootSignature.h" #include "DXILShaderFlags.h" +#include "DXILRootSignature.h" #include "DirectX.h" #include "llvm/ADT/SmallVector.h" #include 
"llvm/ADT/StringExtras.h" @@ -24,7 +25,6 @@ #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/MC/DXContainerPSVInfo.h" -#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" @@ -154,17 +154,15 @@ void DXContainerGlobals::addSignature(Module &M, void DXContainerGlobals::addRootSignature(Module &M, SmallVector &Globals) { - std::optional Desc = - getAnalysis() - .getModuleMetadata() - .RootSignatureDesc; - if (!Desc.has_value()) + std::optional MRS = + getAnalysis() + .getRootSignature(); + if (!MRS.has_value()) return; SmallString<256> Data; raw_svector_ostream OS(Data); - RootSignatureDescWriter writer(&Desc.value()); - writer.write(OS); + MRS->write(OS); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index fd390cdbf9057..4a51198d97ac3 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -1,4 +1,4 @@ -//===- DXILRootSignature.cpp - DXIL Root Signature helper objects -------===// +//===- DXILRootSignature.cpp - DXIL Root Signature helper objects ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -13,226 +13,134 @@ #include "DXILRootSignature.h" #include "DirectX.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include using namespace llvm; using namespace llvm::dxil; -static bool reportError(LLVMContext *Ctx, Twine Message, - DiagnosticSeverity Severity = DS_Error) { - Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); - return true; -} - -static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *RootFlagNode) { - - if (RootFlagNode->getNumOperands() != 2) - return reportError(Ctx, "Invalid format for RootFlag Element"); +static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { + assert(RootFlagNode->getNumOperands() == 2 && + "Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - RSD.Flags = Flag->getZExtValue(); + auto Value = Flag->getZExtValue(); + + // Root Element validation, as specified: https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation + if ((Value & ~0x80000fff) != 0) + return true; + MRS->Flags = Value; return false; } -static bool parseRootSignatureElement(LLVMContext *Ctx, - mcdxbc::RootSignatureDesc &RSD, - MDNode *Element) { +static bool parseRootSignatureElement(ModuleRootSignature *MRS, MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); - if (ElementText == nullptr) - return reportError(Ctx, "Invalid format for Root Element"); + + assert(ElementText != nullptr && "First preoperty of element is not "); RootSignatureElementKind ElementKind = 
StringSwitch(ElementText->getString()) .Case("RootFlags", RootSignatureElementKind::RootFlags) - .Default(RootSignatureElementKind::Error); + .Case("RootConstants", RootSignatureElementKind::RootConstants) + .Case("RootCBV", RootSignatureElementKind::RootDescriptor) + .Case("RootSRV", RootSignatureElementKind::RootDescriptor) + .Case("RootUAV", RootSignatureElementKind::RootDescriptor) + .Case("Sampler", RootSignatureElementKind::RootDescriptor) + .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable) + .Case("StaticSampler", RootSignatureElementKind::StaticSampler) + .Default(RootSignatureElementKind::None); switch (ElementKind) { - case RootSignatureElementKind::RootFlags: - return parseRootFlags(Ctx, RSD, Element); - case RootSignatureElementKind::Error: - return reportError(Ctx, "Invalid Root Signature Element: " + - ElementText->getString()); + case RootSignatureElementKind::RootFlags: { + return parseRootFlags(MRS, Element); + break; } - llvm_unreachable("Unhandled RootSignatureElementKind enum."); -} - -static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *Node) { - bool HasError = false; - - // Loop through the Root Elements of the root signature. - for (const auto &Operand : Node->operands()) { - MDNode *Element = dyn_cast(Operand); - if (Element == nullptr) - return reportError(Ctx, "Missing Root Element Metadata Node."); - - HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element); + case RootSignatureElementKind::RootConstants: + case RootSignatureElementKind::RootDescriptor: + case RootSignatureElementKind::DescriptorTable: + case RootSignatureElementKind::StaticSampler: + case RootSignatureElementKind::None: + llvm_unreachable("Not Implemented yet"); + break; } - return HasError; -} - -static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) { - if (!dxbc::RootSignatureValidations::isValidRootFlag(RSD.Flags)) { - return reportError(Ctx, "Invalid Root Signature flag value"); - } - return false; + return true; } -static SmallDenseMap -analyzeModule(Module &M) { - - /** Root Signature are specified as following in the metadata: - - !dx.rootsignatures = !{!2} ; list of function/root signature pairs - !2 = !{ ptr @main, !3 } ; function, root signature - !3 = !{ !4, !5, !6, !7 } ; list of root signature elements - - So for each MDNode inside dx.rootsignatures NamedMDNode - (the Root parameter of this function), the parsing process needs - to loop through each of its operands and process the function, - signature pair. - */ - - LLVMContext *Ctx = &M.getContext(); +bool ModuleRootSignature::parse( int32_t Version, + NamedMDNode *Root) { + this->Version = Version; + bool HasError = false; - SmallDenseMap RSDMap; + for (unsigned int Sid = 0; Sid < Root->getNumOperands(); Sid++) { + // This should be an if, for error handling + MDNode *Node = cast(Root->getOperand(Sid)); - NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode == nullptr) - return RSDMap; + // Not sure what use this for... + Metadata *Func = Node->getOperand(0).get(); - for (const auto &RSDefNode : RootSignatureNode->operands()) { - if (RSDefNode->getNumOperands() != 2) { - reportError(Ctx, "Invalid format for Root Signature Definition. Pairs " - "of function, root signature expected."); - continue; - } + // This should be an if, for error handling + MDNode *Elements = cast(Node->getOperand(1).get()); - // Function was pruned during compilation. 
- const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0); - if (FunctionPointerMdNode == nullptr) { - reportError( - Ctx, "Function associated with Root Signature definition is null."); - continue; - } + for (unsigned int Eid = 0; Eid < Elements->getNumOperands(); Eid++) { + MDNode *Element = cast(Elements->getOperand(Eid)); - ValueAsMetadata *VAM = - llvm::dyn_cast(FunctionPointerMdNode.get()); - if (VAM == nullptr) { - reportError(Ctx, "First element of root signature is not a Value"); - continue; + HasError = HasError || parseRootSignatureElement(this, Element); } + } + return HasError; +} - Function *F = dyn_cast(VAM->getValue()); - if (F == nullptr) { - reportError(Ctx, "First element of root signature is not a Function"); - continue; - } +void ModuleRootSignature::write(raw_ostream &OS) { + dxbc::RootSignatureDesc Out{this->Version, this->Flags}; - Metadata *RootElementListOperand = RSDefNode->getOperand(1).get(); + if (sys::IsBigEndianHost) { + Out.swapBytes(); + } - if (RootElementListOperand == nullptr) { - reportError(Ctx, "Root Element mdnode is null."); - continue; - } + OS.write(reinterpret_cast(&Out), sizeof(dxbc::RootSignatureDesc)); +} - MDNode *RootElementListNode = dyn_cast(RootElementListOperand); - if (RootElementListNode == nullptr) { - reportError(Ctx, "Root Element is not a metadata node."); - continue; - } +AnalysisKey RootSignatureAnalysis::Key; - mcdxbc::RootSignatureDesc RSD; +ModuleRootSignature RootSignatureAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { + ModuleRootSignature MRSI; - if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) { - return RSDMap; + NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); + if (RootSignatureNode) { + MRSI.parse(1, RootSignatureNode); } - RSDMap.insert(std::make_pair(F, RSD)); - } + return MRSI; - return RSDMap; } -AnalysisKey RootSignatureAnalysis::Key; - -SmallDenseMap -RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { - return analyzeModule(M); -} //===----------------------------------------------------------------------===// +bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { + ModuleRootSignature MRS; -PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, - ModuleAnalysisManager &AM) { - - SmallDenseMap &RSDMap = - AM.getResult(M); - OS << "Root Signature Definitions" - << "\n"; - uint8_t Space = 0; - for (const Function &F : M) { - auto It = RSDMap.find(&F); - if (It == RSDMap.end()) - continue; - const auto &RS = It->second; - OS << "Definition for '" << F.getName() << "':\n"; - - // start root signature header - Space++; - OS << indent(Space) << "Flags: " << format_hex(RS.Flags, 8) << ":\n"; - OS << indent(Space) << "Version: " << RS.Version << ":\n"; - OS << indent(Space) << "NumParameters: " << RS.NumParameters << ":\n"; - OS << indent(Space) << "RootParametersOffset: " << RS.RootParametersOffset - << ":\n"; - OS << indent(Space) << "NumStaticSamplers: " << RS.NumStaticSamplers - << ":\n"; - OS << indent(Space) << "StaticSamplersOffset: " << RS.StaticSamplersOffset - << ":\n"; - Space--; - // end root signature header - } + NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); + if (RootSignatureNode) { + MRS.parse(1, RootSignatureNode); + this->MRS = MRS; + } - return PreservedAnalyses::all(); -} -//===----------------------------------------------------------------------===// -bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { - FuncToRsMap = analyzeModule(M); - return false; + return false; } void 
RootSignatureAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); } char RootSignatureAnalysisWrapper::ID = 0; -INITIALIZE_PASS_BEGIN(RootSignatureAnalysisWrapper, - "dxil-root-signature-analysis", - "DXIL Root Signature Analysis", true, true) -INITIALIZE_PASS_END(RootSignatureAnalysisWrapper, - "dxil-root-signature-analysis", - "DXIL Root Signature Analysis", true, true) +INITIALIZE_PASS(RootSignatureAnalysisWrapper, "dx-root-signature-analysis", + "DXIL Root Signature Analysis", true, true) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 8c25b2eb3fadf..fdfd6c41c0af3 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -1,4 +1,4 @@ -//===- DXILRootSignature.h - DXIL Root Signature helper objects -----------===// +//===- DXILRootSignature.h - DXIL Root Signature helper objects ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,67 +11,65 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/ADT/DenseMap.h" -#include "llvm/Analysis/DXILMetadataAnalysis.h" -#include "llvm/IR/DiagnosticInfo.h" + #include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" #include namespace llvm { namespace dxil { -enum class RootSignatureElementKind { Error = 0, RootFlags = 1 }; -class RootSignatureAnalysis : public AnalysisInfoMixin { - friend AnalysisInfoMixin; - static AnalysisKey Key; -public: - RootSignatureAnalysis() = default; + enum class RootSignatureElementKind { + None = 0, + RootFlags = 1, + RootConstants = 2, + RootDescriptor = 3, + DescriptorTable = 4, + StaticSampler = 5 + }; - using Result = SmallDenseMap; + struct ModuleRootSignature { + uint32_t Version; + uint32_t Flags; - SmallDenseMap - run(Module &M, ModuleAnalysisManager &AM); -}; + ModuleRootSignature() = default; -/// Wrapper pass for the legacy pass manager. -/// -/// This is required because the passes that will depend on this are codegen -/// passes which run through the legacy pass manager. -class RootSignatureAnalysisWrapper : public ModulePass { -private: - SmallDenseMap FuncToRsMap; + bool parse( int32_t Version, NamedMDNode *Root); + void write(raw_ostream &OS); + }; + + class RootSignatureAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; -public: - static char ID; + public: + RootSignatureAnalysis() = default; - RootSignatureAnalysisWrapper() : ModulePass(ID) {} + using Result = ModuleRootSignature; - using iterator = - SmallDenseMap::iterator; + ModuleRootSignature run(Module &M, ModuleAnalysisManager &AM); + }; - iterator find(const Function *F) { return FuncToRsMap.find(F); } + /// Wrapper pass for the legacy pass manager. + /// + /// This is required because the passes that will depend on this are codegen + /// passes which run through the legacy pass manager. 
+  class RootSignatureAnalysisWrapper : public ModulePass {
+    std::optional MRS;
 
-  iterator end() { return FuncToRsMap.end(); }
+  public:
+    static char ID;
 
-  bool runOnModule(Module &M) override;
+    RootSignatureAnalysisWrapper() : ModulePass(ID) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-};
+    const std::optional &getRootSignature() { return MRS; }
 
-/// Printer pass for RootSignatureAnalysis results.
-class RootSignatureAnalysisPrinter
-    : public PassInfoMixin {
-  raw_ostream &OS;
+    bool runOnModule(Module &M) override;
 
-public:
-  explicit RootSignatureAnalysisPrinter(raw_ostream &OS) : OS(OS) {}
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
+  };
 
 } // namespace dxil
 } // namespace llvm
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll
index 3f5bb166ad0e5..ffbf5e9ffd1d3 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll
@@ -3,12 +3,14 @@
 
 target triple = "dxil-unknown-shadermodel6.0-compute"
 
-; CHECK: @dx.rts0 = private constant [24 x i8] c"{{.*}}", section "RTS0", align 4
+; CHECK: @dx.rts0 = private constant [8 x i8] c"{{.*}}", section "RTS0", align 4
+
 define void @main() #0 {
 entry:
   ret void
 }
+
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
@@ -18,12 +20,19 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
 
-; DXC: - Name: RTS0
-; DXC-NEXT: Size: 24
-; DXC-NEXT: RootSignature:
-; DXC-NEXT: Version: 2
-; DXC-NEXT: NumParameters: 0
-; DXC-NEXT: RootParametersOffset: 0
-; DXC-NEXT: NumStaticSamplers: 0
-; DXC-NEXT: StaticSamplersOffset: 0
-; DXC-NEXT: AllowInputAssemblerInputLayout: true
+; DXC: - Name: RTS0
+; DXC-NEXT: Size: 8
+; DXC-NEXT: RootSignature:
+; DXC-NEXT: Version: 1
+; DXC-NEXT: AllowInputAssemblerInputLayout: true
+; DXC-NEXT: DenyVertexShaderRootAccess: false
+; DXC-NEXT: DenyHullShaderRootAccess: false
+; DXC-NEXT: DenyDomainShaderRootAccess: false
+; DXC-NEXT: DenyGeometryShaderRootAccess: false
+; DXC-NEXT: DenyPixelShaderRootAccess: false
+; DXC-NEXT: AllowStreamOutput: false
+; DXC-NEXT: LocalRootSignature: false
+; DXC-NEXT: DenyAmplificationShaderRootAccess: false
+; DXC-NEXT: DenyMeshShaderRootAccess: false
+; DXC-NEXT: CBVSRVUAVHeapDirectlyIndexed: false
+; DXC-NEXT: SamplerHeapDirectlyIndexed: false

From 499d87963d9562f42079f4ef216f8f21b018f4e4 Mon Sep 17 00:00:00 2001
From: joaosaffran
Date: Thu, 16 Jan 2025 00:37:14 +0000
Subject: [PATCH 154/220] formatting

---
 llvm/lib/Analysis/DXILMetadataAnalysis.cpp    |  2 -
 .../lib/Target/DirectX/DXContainerGlobals.cpp |  4 +-
 llvm/lib/Target/DirectX/DXILRootSignature.cpp | 48 ++++++------
 llvm/lib/Target/DirectX/DXILRootSignature.h   | 77 +++++++++----------
 4 files changed, 63 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
index 15e72bf17515b..197b7e422092c 100644
--- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
+++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
@@ -24,8 +24,6 @@
 using namespace llvm;
 using namespace dxil;
 
-
-
 static ModuleMetadataInfo collectMetadataInfo(Module &M) {
   ModuleMetadataInfo MMDAI;
   Triple TT(Triple(M.getTargetTriple()));
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 491f42f0413f2..853baffcb28ec 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -12,7 +12,6 @@ #include "DXILRootSignature.h" #include "DXILShaderFlags.h" -#include "DXILRootSignature.h" #include "DirectX.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -155,8 +154,7 @@ void DXContainerGlobals::addRootSignature(Module &M, SmallVector &Globals) { std::optional MRS = - getAnalysis() - .getRootSignature(); + getAnalysis().getRootSignature(); if (!MRS.has_value()) return; diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 4a51198d97ac3..89621868a9336 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -1,4 +1,5 @@ -//===- DXILRootSignature.cpp - DXIL Root Signature helper objects ---------------===// +//===- DXILRootSignature.cpp - DXIL Root Signature helper objects +//---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -28,7 +29,8 @@ static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); auto Value = Flag->getZExtValue(); - // Root Element validation, as specified: https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation + // Root Element validation, as specified: + // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation if ((Value & ~0x80000fff) != 0) return true; @@ -36,7 +38,8 @@ static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { return false; } -static bool parseRootSignatureElement(ModuleRootSignature *MRS, MDNode *Element) { +static bool parseRootSignatureElement(ModuleRootSignature *MRS, + MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); assert(ElementText != nullptr && "First preoperty of element is not "); @@ -72,8 +75,7 @@ static bool parseRootSignatureElement(ModuleRootSignature *MRS, MDNode *Element) return true; } -bool ModuleRootSignature::parse( int32_t Version, - NamedMDNode *Root) { +bool ModuleRootSignature::parse(int32_t Version, NamedMDNode *Root) { this->Version = Version; bool HasError = false; @@ -103,37 +105,35 @@ void ModuleRootSignature::write(raw_ostream &OS) { Out.swapBytes(); } - OS.write(reinterpret_cast(&Out), sizeof(dxbc::RootSignatureDesc)); + OS.write(reinterpret_cast(&Out), + sizeof(dxbc::RootSignatureDesc)); } AnalysisKey RootSignatureAnalysis::Key; ModuleRootSignature RootSignatureAnalysis::run(Module &M, - ModuleAnalysisManager &AM) { - ModuleRootSignature MRSI; + ModuleAnalysisManager &AM) { + ModuleRootSignature MRSI; - NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode) { - MRSI.parse(1, RootSignatureNode); - } - - return MRSI; + NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); + if (RootSignatureNode) { + MRSI.parse(1, RootSignatureNode); + } + return MRSI; } - //===----------------------------------------------------------------------===// bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { ModuleRootSignature MRS; - NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode) { - MRS.parse(1, RootSignatureNode); - this->MRS = MRS; - } - + 
NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); + if (RootSignatureNode) { + MRS.parse(1, RootSignatureNode); + this->MRS = MRS; + } - return false; + return false; } void RootSignatureAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { @@ -142,5 +142,5 @@ void RootSignatureAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { char RootSignatureAnalysisWrapper::ID = 0; -INITIALIZE_PASS(RootSignatureAnalysisWrapper, "dx-root-signature-analysis", - "DXIL Root Signature Analysis", true, true) +INITIALIZE_PASS(RootSignatureAnalysisWrapper, "dx-root-signature-analysis", + "DXIL Root Signature Analysis", true, true) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index fdfd6c41c0af3..de82afcdc8c46 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -1,4 +1,5 @@ -//===- DXILRootSignature.h - DXIL Root Signature helper objects ---------------===// +//===- DXILRootSignature.h - DXIL Root Signature helper objects +//---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,7 +12,6 @@ /// //===----------------------------------------------------------------------===// - #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -20,56 +20,55 @@ namespace llvm { namespace dxil { +enum class RootSignatureElementKind { + None = 0, + RootFlags = 1, + RootConstants = 2, + RootDescriptor = 3, + DescriptorTable = 4, + StaticSampler = 5 +}; - enum class RootSignatureElementKind { - None = 0, - RootFlags = 1, - RootConstants = 2, - RootDescriptor = 3, - DescriptorTable = 4, - StaticSampler = 5 - }; - - struct ModuleRootSignature { - uint32_t Version; - uint32_t Flags; +struct ModuleRootSignature { + uint32_t Version; + uint32_t Flags; - ModuleRootSignature() = default; + ModuleRootSignature() = default; - bool parse( int32_t Version, NamedMDNode *Root); - void write(raw_ostream &OS); - }; + bool parse(int32_t Version, NamedMDNode *Root); + void write(raw_ostream &OS); +}; - class RootSignatureAnalysis : public AnalysisInfoMixin { - friend AnalysisInfoMixin; - static AnalysisKey Key; +class RootSignatureAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; - public: - RootSignatureAnalysis() = default; +public: + RootSignatureAnalysis() = default; - using Result = ModuleRootSignature; + using Result = ModuleRootSignature; - ModuleRootSignature run(Module &M, ModuleAnalysisManager &AM); - }; + ModuleRootSignature run(Module &M, ModuleAnalysisManager &AM); +}; - /// Wrapper pass for the legacy pass manager. - /// - /// This is required because the passes that will depend on this are codegen - /// passes which run through the legacy pass manager. - class RootSignatureAnalysisWrapper : public ModulePass { - std::optional MRS; +/// Wrapper pass for the legacy pass manager. +/// +/// This is required because the passes that will depend on this are codegen +/// passes which run through the legacy pass manager. 
+class RootSignatureAnalysisWrapper : public ModulePass { + std::optional MRS; - public: - static char ID; +public: + static char ID; - RootSignatureAnalysisWrapper() : ModulePass(ID) {} + RootSignatureAnalysisWrapper() : ModulePass(ID) {} - const std::optional &getRootSignature() { return MRS; } + const std::optional &getRootSignature() { return MRS; } - bool runOnModule(Module &M) override; + bool runOnModule(Module &M) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - }; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; } // namespace dxil } // namespace llvm From c4af535b203e282ddc20f0efdf02865fb5c57991 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 16 Jan 2025 00:42:54 +0000 Subject: [PATCH 155/220] removing useless imports --- llvm/include/llvm/Analysis/DXILMetadataAnalysis.h | 1 - llvm/lib/Analysis/DXILMetadataAnalysis.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h index dcc3237f57802..cb535ac14f1c6 100644 --- a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h +++ b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h @@ -14,7 +14,6 @@ #include "llvm/Pass.h" #include "llvm/Support/VersionTuple.h" #include "llvm/TargetParser/Triple.h" -#include namespace llvm { diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp index 197b7e422092c..a7f666a3f8b48 100644 --- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp +++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp @@ -15,9 +15,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include #define DEBUG_TYPE "dxil-metadata-analysis" @@ -30,7 +28,6 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) { MMDAI.DXILVersion = TT.getDXILVersion(); MMDAI.ShaderModelVersion = TT.getOSVersion(); MMDAI.ShaderProfile = TT.getEnvironment(); - NamedMDNode *ValidatorVerNode = M.getNamedMetadata("dx.valver"); if (ValidatorVerNode) { auto *ValVerMD = cast(ValidatorVerNode->getOperand(0)); From 819fa0d125b49c79e1217368e1515180463f7019 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 16 Jan 2025 19:22:31 +0000 Subject: [PATCH 156/220] fixing pr changes --- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 0869fd4fa9785..afcc093cf0456 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/ScopedPrinter.h" +#include namespace llvm { From d347a87b720f56b3ce7dffb5a6c9f2bbc6e0eb49 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 16 Jan 2025 20:06:13 +0000 Subject: [PATCH 157/220] adding some asserts --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 89621868a9336..024743b9f81a6 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include using namespace llvm; using namespace llvm::dxil; @@ -31,8 +32,7 @@ static bool 
parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { // Root Element validation, as specified: // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation - if ((Value & ~0x80000fff) != 0) - return true; + assert((Value & ~0x80000fff) != 0 && "Invalid flag for RootFlag Element"); MRS->Flags = Value; return false; @@ -41,8 +41,7 @@ static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { static bool parseRootSignatureElement(ModuleRootSignature *MRS, MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); - - assert(ElementText != nullptr && "First preoperty of element is not "); + assert(ElementText != nullptr && "First preoperty of element is not a string"); RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) @@ -84,13 +83,14 @@ bool ModuleRootSignature::parse(int32_t Version, NamedMDNode *Root) { MDNode *Node = cast(Root->getOperand(Sid)); // Not sure what use this for... - Metadata *Func = Node->getOperand(0).get(); + // Metadata *Func = Node->getOperand(0).get(); - // This should be an if, for error handling MDNode *Elements = cast(Node->getOperand(1).get()); + assert(Elements && "Invalid Metadata type on root signature"); for (unsigned int Eid = 0; Eid < Elements->getNumOperands(); Eid++) { MDNode *Element = cast(Elements->getOperand(Eid)); + assert(Element && "Invalid Metadata type on root element"); HasError = HasError || parseRootSignatureElement(this, Element); } From d8824ed95e108becc210c1722fcc3204bd456c68 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 16 Jan 2025 20:11:36 +0000 Subject: [PATCH 158/220] format --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 024743b9f81a6..cabaec3671078 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -41,7 +41,8 @@ static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { static bool parseRootSignatureElement(ModuleRootSignature *MRS, MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); - assert(ElementText != nullptr && "First preoperty of element is not a string"); + assert(ElementText != nullptr && + "First preoperty of element is not a string"); RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) From 25c03846822d9e0842fa6a514f254169abd7b3c6 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Sat, 18 Jan 2025 00:24:53 +0000 Subject: [PATCH 159/220] fixing assert --- llvm/lib/MC/CMakeLists.txt | 1 + llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 7 ++++++- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 13 +------------ llvm/lib/Target/DirectX/DXILRootSignature.h | 1 - .../DirectX/ContainerData/RootSignature-Flags.ll | 16 +++------------- 5 files changed, 11 insertions(+), 27 deletions(-) diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index e1d19196c8766..f49f14c848b90 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_component_library(LLVMMC ConstantPools.cpp DXContainerPSVInfo.cpp + DXContainerRootSignature.cpp ELFObjectWriter.cpp GOFFObjectWriter.cpp MCAsmBackend.cpp diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 853baffcb28ec..5e6da115b9a8a 100644 --- 
a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/MC/DXContainerPSVInfo.h" +#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" @@ -160,7 +161,11 @@ void DXContainerGlobals::addRootSignature(Module &M, SmallString<256> Data; raw_svector_ostream OS(Data); - MRS->write(OS); + + RootSignatureHeader RSH; + RSH.Flags = MRS->Flags; + RSH.Version = MRS->Version; + RSH.write(OS); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index cabaec3671078..5ee9eea68b9e6 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -32,7 +32,7 @@ static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { // Root Element validation, as specified: // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation - assert((Value & ~0x80000fff) != 0 && "Invalid flag for RootFlag Element"); + assert((Value & ~0x80000fff) == 0 && "Invalid flag for RootFlag Element"); MRS->Flags = Value; return false; @@ -99,17 +99,6 @@ bool ModuleRootSignature::parse(int32_t Version, NamedMDNode *Root) { return HasError; } -void ModuleRootSignature::write(raw_ostream &OS) { - dxbc::RootSignatureDesc Out{this->Version, this->Flags}; - - if (sys::IsBigEndianHost) { - Out.swapBytes(); - } - - OS.write(reinterpret_cast(&Out), - sizeof(dxbc::RootSignatureDesc)); -} - AnalysisKey RootSignatureAnalysis::Key; ModuleRootSignature RootSignatureAnalysis::run(Module &M, diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index de82afcdc8c46..3bbbaa12b0798 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -36,7 +36,6 @@ struct ModuleRootSignature { ModuleRootSignature() = default; bool parse(int32_t Version, NamedMDNode *Root); - void write(raw_ostream &OS); }; class RootSignatureAnalysis : public AnalysisInfoMixin { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index ffbf5e9ffd1d3..20253efbb8e5c 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -3,7 +3,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: @dx.rts0 = private constant [8 x i8] c"{{.*}}", section "RTS0", align 4 +; CHECK: @dx.rts0 = private constant [12 x i8] c"{{.*}}", section "RTS0", align 4 define void @main() #0 { @@ -21,18 +21,8 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC: - Name: RTS0 -; DXC-NEXT: Size: 8 +; DXC-NEXT: Size: 12 ; DXC-NEXT: RootSignature: +; DXC-NEXT: Size: 8 ; DXC-NEXT: Version: 1 ; DXC-NEXT: AllowInputAssemblerInputLayout: true -; DXC-NEXT: DenyVertexShaderRootAccess: false -; DXC-NEXT: DenyHullShaderRootAccess: false -; DXC-NEXT: DenyDomainShaderRootAccess: false -; DXC-NEXT: DenyGeometryShaderRootAccess: false -; DXC-NEXT: DenyPixelShaderRootAccess: false -; DXC-NEXT: AllowStreamOutput: false -; DXC-NEXT: LocalRootSignature: false -; DXC-NEXT: DenyAmplificationShaderRootAccess: false -; DXC-NEXT: 
DenyMeshShaderRootAccess: false -; DXC-NEXT: CBVSRVUAVHeapDirectlyIndexed: false -; DXC-NEXT: SamplerHeapDirectlyIndexed: false From 5eb0ad25048b5ee918beb74c7d0421927b8136f3 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 27 Jan 2025 23:45:45 +0000 Subject: [PATCH 160/220] cleaning --- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 1 - llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index afcc093cf0456..0869fd4fa9785 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/ScopedPrinter.h" -#include namespace llvm { diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 5e6da115b9a8a..52d16f45b9b8f 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -29,7 +29,6 @@ #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include using namespace llvm; using namespace llvm::dxil; From 559427db09a41e31da5d15ebbd24f4310fc7e526 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 29 Jan 2025 18:36:55 +0000 Subject: [PATCH 161/220] clean up --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 5ee9eea68b9e6..71ca8a91bc3fe 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -14,11 +14,9 @@ #include "DXILRootSignature.h" #include "DirectX.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include using namespace llvm; using namespace llvm::dxil; From 8ca5b2acca09bdc1658a66798acf3d5604ba701b Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 30 Jan 2025 00:22:01 +0000 Subject: [PATCH 162/220] addressing comments --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 89 ++++++++++++------- llvm/lib/Target/DirectX/DXILRootSignature.h | 2 + .../ContainerData/RootSignature-Error.ll | 5 +- .../RootSignature-Flags-Error.ll | 5 +- .../RootSignature-Flags-Validation-Error.ll | 7 +- .../RootSignature-RootElement-Error.ll | 5 +- 6 files changed, 68 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 71ca8a91bc3fe..52c7ad8e24937 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -1,5 +1,4 @@ -//===- DXILRootSignature.cpp - DXIL Root Signature helper objects -//---------------===// +//===- DXILRootSignature.cpp - DXIL Root Signature helper objects ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -14,23 +13,31 @@ #include "DXILRootSignature.h" #include "DirectX.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include using namespace llvm; using namespace llvm::dxil; +static bool reportError(Twine Message) { + report_fatal_error(Message, false); + return true; +} + static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { - assert(RootFlagNode->getNumOperands() == 2 && - "Invalid format for RootFlag Element"); + if (RootFlagNode->getNumOperands() != 2) + return reportError("Invalid format for RootFlag Element"); + auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - auto Value = Flag->getZExtValue(); + uint32_t Value = Flag->getZExtValue(); // Root Element validation, as specified: // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation - assert((Value & ~0x80000fff) == 0 && "Invalid flag for RootFlag Element"); + if ((Value & ~0x80000fff) != 0) + return reportError("Invalid flag value for RootFlag"); MRS->Flags = Value; return false; @@ -39,8 +46,8 @@ static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { static bool parseRootSignatureElement(ModuleRootSignature *MRS, MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); - assert(ElementText != nullptr && - "First preoperty of element is not a string"); + if (ElementText == nullptr) + return reportError("Invalid format for Root Element"); RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) @@ -66,7 +73,7 @@ static bool parseRootSignatureElement(ModuleRootSignature *MRS, case RootSignatureElementKind::DescriptorTable: case RootSignatureElementKind::StaticSampler: case RootSignatureElementKind::None: - llvm_unreachable("Not Implemented yet"); + return reportError("Invalid Root Element: " + ElementText->getString()); break; } @@ -77,19 +84,37 @@ bool ModuleRootSignature::parse(int32_t Version, NamedMDNode *Root) { this->Version = Version; bool HasError = false; + /** Root Signature are specified as following in the metadata: + + !dx.rootsignatures = !{!2} ; list of function/root signature pairs + !2 = !{ ptr @main, !3 } ; function, root signature + !3 = !{ !4, !5, !6, !7 } ; list of root signature elements + + So for each MDNode inside dx.rootsignatures NamedMDNode + (the Root parameter of this function), the parsing process needs + to loop through each of it's operand and process the pairs function + signature pair. + */ + for (unsigned int Sid = 0; Sid < Root->getNumOperands(); Sid++) { - // This should be an if, for error handling - MDNode *Node = cast(Root->getOperand(Sid)); + MDNode *Node = dyn_cast(Root->getOperand(Sid)); + + if (Node == nullptr || Node->getNumOperands() != 2) + return reportError("Invalid format for Root Signature Definition. Pairs " + "of function, root signature expected."); + + // Get the Root Signature Description from the function signature pair. + MDNode *RS = dyn_cast(Node->getOperand(1).get()); - // Not sure what use this for... - // Metadata *Func = Node->getOperand(0).get(); + if (RS == nullptr) + return reportError("Missing Root Signature Metadata node."); - MDNode *Elements = cast(Node->getOperand(1).get()); - assert(Elements && "Invalid Metadata type on root signature"); + // Loop through the Root Elements of the root signature. 
+ for (unsigned int Eid = 0; Eid < RS->getNumOperands(); Eid++) { - for (unsigned int Eid = 0; Eid < Elements->getNumOperands(); Eid++) { - MDNode *Element = cast(Elements->getOperand(Eid)); - assert(Element && "Invalid Metadata type on root element"); + MDNode *Element = dyn_cast(RS->getOperand(Eid)); + if (Element == nullptr) + return reportError("Missing Root Element Metadata Node."); HasError = HasError || parseRootSignatureElement(this, Element); } @@ -97,29 +122,29 @@ bool ModuleRootSignature::parse(int32_t Version, NamedMDNode *Root) { return HasError; } -AnalysisKey RootSignatureAnalysis::Key; - -ModuleRootSignature RootSignatureAnalysis::run(Module &M, - ModuleAnalysisManager &AM) { - ModuleRootSignature MRSI; +ModuleRootSignature ModuleRootSignature::analyzeModule(Module &M) { + ModuleRootSignature MRS; NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); if (RootSignatureNode) { - MRSI.parse(1, RootSignatureNode); + if (MRS.parse(1, RootSignatureNode)) + llvm_unreachable("Invalid Root Signature Metadata."); } - return MRSI; + return MRS; +} + +AnalysisKey RootSignatureAnalysis::Key; + +ModuleRootSignature RootSignatureAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { + return ModuleRootSignature::analyzeModule(M); } //===----------------------------------------------------------------------===// bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { - ModuleRootSignature MRS; - NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode) { - MRS.parse(1, RootSignatureNode); - this->MRS = MRS; - } + this->MRS = MRS = ModuleRootSignature::analyzeModule(M); return false; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 3bbbaa12b0798..0439deea6451a 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -36,6 +36,8 @@ struct ModuleRootSignature { ModuleRootSignature() = default; bool parse(int32_t Version, NamedMDNode *Root); + + static ModuleRootSignature analyzeModule(Module &M); }; class RootSignatureAnalysis : public AnalysisInfoMixin { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll index 2a2188b1a13bb..cbcd8e56c1c04 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll @@ -1,9 +1,8 @@ -; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s +; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: error: Invalid format for Root Signature Definition. Pairs of function, root signature expected. -; CHECK-NOT: Root Signature Definitions +; CHECK: LLVM ERROR: Invalid format for Root Signature Definition. Pairs of function, root signature expected. 
define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll index 4921472d253ad..9b4208011bba5 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll @@ -1,9 +1,8 @@ -; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s +; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: error: Invalid Root Signature Element: NOTRootFlags -; CHECK-NOT: Root Signature Definitions +; CHECK: LLVM ERROR: Invalid Root Element: NOTRootFlags define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll index fe93c9993c1c3..85e6f4d6748d5 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll @@ -1,10 +1,9 @@ -; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s - -; CHECK: error: Invalid Root Signature flag value -; CHECK-NOT: Root Signature Definitions +; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" +; CHECK: LLVM ERROR: Invalid flag value for RootFlag + define void @main() #0 { entry: diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll index 89e23f6540c5f..501e3438943a3 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll @@ -1,9 +1,8 @@ -; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s +; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: error: Missing Root Element Metadata Node. -; CHECK-NOT: Root Signature Definitions +; CHECK: LLVM ERROR: Missing Root Element Metadata Node. 
define void @main() #0 { From d52cd2c0e39780b9d3e476693e57fc6762c9cf49 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 30 Jan 2025 18:33:23 +0000 Subject: [PATCH 163/220] removing version --- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 2 +- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 5 ++--- llvm/lib/Target/DirectX/DXILRootSignature.h | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 52d16f45b9b8f..abc4bd09fc7c0 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -163,7 +163,7 @@ void DXContainerGlobals::addRootSignature(Module &M, RootSignatureHeader RSH; RSH.Flags = MRS->Flags; - RSH.Version = MRS->Version; + RSH.write(OS); Constant *Constant = diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 52c7ad8e24937..c86be5bd9eb67 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -80,8 +80,7 @@ static bool parseRootSignatureElement(ModuleRootSignature *MRS, return true; } -bool ModuleRootSignature::parse(int32_t Version, NamedMDNode *Root) { - this->Version = Version; +bool ModuleRootSignature::parse(NamedMDNode *Root) { bool HasError = false; /** Root Signature are specified as following in the metadata: @@ -127,7 +126,7 @@ ModuleRootSignature ModuleRootSignature::analyzeModule(Module &M) { NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); if (RootSignatureNode) { - if (MRS.parse(1, RootSignatureNode)) + if (MRS.parse(RootSignatureNode)) llvm_unreachable("Invalid Root Signature Metadata."); } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 0439deea6451a..f89fb0f00b5a4 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -30,12 +30,11 @@ enum class RootSignatureElementKind { }; struct ModuleRootSignature { - uint32_t Version; uint32_t Flags; ModuleRootSignature() = default; - bool parse(int32_t Version, NamedMDNode *Root); + bool parse(NamedMDNode *Root); static ModuleRootSignature analyzeModule(Module &M); }; From 5930dcb81d37b4ca42d64fb7ba5ff70630195882 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 30 Jan 2025 22:29:30 +0000 Subject: [PATCH 164/220] fix test --- .../CodeGen/DirectX/ContainerData/RootSignature-Flags.ll | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index 20253efbb8e5c..b44d31c5b3857 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -3,7 +3,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: @dx.rts0 = private constant [12 x i8] c"{{.*}}", section "RTS0", align 4 +; CHECK: @dx.rts0 = private constant [8 x i8] c"{{.*}}", section "RTS0", align 4 define void @main() #0 { @@ -11,6 +11,9 @@ entry: ret void } + + + attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } @@ -21,8 +24,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC: - Name: RTS0 -; DXC-NEXT: Size: 12 +; DXC-NEXT: Size: 8 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Size: 8 -; DXC-NEXT: Version: 1 ; DXC-NEXT: AllowInputAssemblerInputLayout: true 
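Note on the flag check that keeps moving around in this series: whether it is written as the assert from "adding some asserts", the reportError path from "addressing comments", or the dxbc::RootSignatureValidations helper touched in the later "improve error handling" patches, the root-flag validation always reduces to the same mask test against 0x80000fff. A minimal standalone sketch of that test follows; the helper name here is purely illustrative and is not the function defined in DXContainer.h or DXILRootSignature.cpp.

    #include <cstdint>

    // Sketch of the validation used throughout these patches: only the twelve
    // low bits plus bit 31 are defined root signature flags, so any bit outside
    // the 0x80000fff mask makes the value invalid.
    static bool isValidRootSignatureFlags(uint32_t Flags) {
      return (Flags & ~0x80000fffu) == 0;
    }

For example, the value used by the RootSignature-Flags.ll test metadata, !{ !"RootFlags", i32 1 } (allow_input_assembler_input_layout), falls inside the mask and passes this check, while any value with undefined bits set takes the error path exercised by the validation-error tests.
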
From fc72988d114bd5b1f2f12deb383cdfd3a62ac0b2 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 31 Jan 2025 00:42:33 +0000 Subject: [PATCH 165/220] addressing PR Comments --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 5 ++--- llvm/lib/Target/DirectX/DXILRootSignature.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index c86be5bd9eb67..109069eb66dea 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -95,10 +95,9 @@ bool ModuleRootSignature::parse(NamedMDNode *Root) { signature pair. */ - for (unsigned int Sid = 0; Sid < Root->getNumOperands(); Sid++) { - MDNode *Node = dyn_cast(Root->getOperand(Sid)); + for (const MDNode *Node : Root->operands()) { - if (Node == nullptr || Node->getNumOperands() != 2) + if (Node->getNumOperands() != 2) return reportError("Invalid format for Root Signature Definition. Pairs " "of function, root signature expected."); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index f89fb0f00b5a4..5bbea29d22ae5 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -30,7 +30,7 @@ enum class RootSignatureElementKind { }; struct ModuleRootSignature { - uint32_t Flags; + uint32_t Flags = 0; ModuleRootSignature() = default; From 2d1ee0d198d046591c81249792757f0dabdaa7bc Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 3 Feb 2025 21:32:59 +0000 Subject: [PATCH 166/220] fix test --- .../DirectX/ContainerData/RootSignature-Flags.ll | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index b44d31c5b3857..c3e38c44c6194 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -3,7 +3,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: @dx.rts0 = private constant [8 x i8] c"{{.*}}", section "RTS0", align 4 +; CHECK: @dx.rts0 = private constant [24 x i8] c"{{.*}}", section "RTS0", align 4 define void @main() #0 { @@ -23,8 +23,12 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout -; DXC: - Name: RTS0 -; DXC-NEXT: Size: 8 -; DXC-NEXT: RootSignature: -; DXC-NEXT: Size: 8 -; DXC-NEXT: AllowInputAssemblerInputLayout: true +; DXC: - Name: RTS0 +; DXC-NEXT: Size: 24 +; DXC-NEXT: RootSignature: +; DXC-NEXT: Version: 2 +; DXC-NEXT: NumParameters: 0 +; DXC-NEXT: RootParametersOffset: 0 +; DXC-NEXT: NumStaticSamplers: 0 +; DXC-NEXT: StaticSamplersOffset: 0 +; DXC-NEXT: AllowInputAssemblerInputLayout: true From 92a85fe6160b82a55d6e7845ceb38fc3aca7f3d2 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 4 Feb 2025 00:07:43 +0000 Subject: [PATCH 167/220] filtering root signatures not associated with entry function --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 57 ++++++++++++++++--- llvm/lib/Target/DirectX/DXILRootSignature.h | 4 +- .../ContainerData/RootSignature-Flags.ll | 1 - 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 109069eb66dea..984505b3fb85b 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ 
b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -14,9 +14,12 @@ #include "DirectX.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" using namespace llvm; using namespace llvm::dxil; @@ -80,7 +83,7 @@ static bool parseRootSignatureElement(ModuleRootSignature *MRS, return true; } -bool ModuleRootSignature::parse(NamedMDNode *Root) { +bool ModuleRootSignature::parse(NamedMDNode *Root, const Function *EF) { bool HasError = false; /** Root Signature are specified as following in the metadata: @@ -96,11 +99,25 @@ bool ModuleRootSignature::parse(NamedMDNode *Root) { */ for (const MDNode *Node : Root->operands()) { - if (Node->getNumOperands() != 2) return reportError("Invalid format for Root Signature Definition. Pairs " "of function, root signature expected."); + Metadata *MD = Node->getOperand(0).get(); + if (auto *VAM = llvm::dyn_cast(MD)) { + llvm::Value *V = VAM->getValue(); + if (Function *F = dyn_cast(V)) { + if (F != EF) + continue; + } else { + return reportError( + "Root Signature MD node, first element is not a function."); + } + } else { + return reportError( + "Root Signature MD node, first element is not a function."); + } + // Get the Root Signature Description from the function signature pair. MDNode *RS = dyn_cast(Node->getOperand(1).get()); @@ -120,12 +137,13 @@ bool ModuleRootSignature::parse(NamedMDNode *Root) { return HasError; } -ModuleRootSignature ModuleRootSignature::analyzeModule(Module &M) { +ModuleRootSignature ModuleRootSignature::analyzeModule(Module &M, + const Function *F) { ModuleRootSignature MRS; NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); if (RootSignatureNode) { - if (MRS.parse(RootSignatureNode)) + if (MRS.parse(RootSignatureNode, F)) llvm_unreachable("Invalid Root Signature Metadata."); } @@ -136,22 +154,43 @@ AnalysisKey RootSignatureAnalysis::Key; ModuleRootSignature RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { - return ModuleRootSignature::analyzeModule(M); + auto MMI = AM.getResult(M); + + if (MMI.ShaderProfile == Triple::Library) + return ModuleRootSignature(); + + assert(MMI.EntryPropertyVec.size() == 1); + + const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; + return ModuleRootSignature::analyzeModule(M, EntryFunction); } //===----------------------------------------------------------------------===// bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { - this->MRS = MRS = ModuleRootSignature::analyzeModule(M); + dxil::ModuleMetadataInfo &MMI = + getAnalysis().getModuleMetadata(); + + if (MMI.ShaderProfile == Triple::Library) + return false; + assert(MMI.EntryPropertyVec.size() == 1); + + const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; + this->MRS = MRS = ModuleRootSignature::analyzeModule(M, EntryFunction); return false; } void RootSignatureAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); + AU.addRequired(); } char RootSignatureAnalysisWrapper::ID = 0; -INITIALIZE_PASS(RootSignatureAnalysisWrapper, "dx-root-signature-analysis", - "DXIL Root Signature Analysis", true, true) +INITIALIZE_PASS_BEGIN(RootSignatureAnalysisWrapper, + "dx-root-signature-analysis", + "DXIL Root Signature Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) 
+INITIALIZE_PASS_END(RootSignatureAnalysisWrapper, "dx-root-signature-analysis", + "DXIL Root Signature Analysis", true, true) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 5bbea29d22ae5..0650ffa7edf41 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -34,9 +34,9 @@ struct ModuleRootSignature { ModuleRootSignature() = default; - bool parse(NamedMDNode *Root); + bool parse(NamedMDNode *Root, const Function *F); - static ModuleRootSignature analyzeModule(Module &M); + static ModuleRootSignature analyzeModule(Module &M, const Function *F); }; class RootSignatureAnalysis : public AnalysisInfoMixin { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index c3e38c44c6194..cf00609a7307e 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -5,7 +5,6 @@ target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: @dx.rts0 = private constant [24 x i8] c"{{.*}}", section "RTS0", align 4 - define void @main() #0 { entry: ret void From 979ee915695e9e9503a7a8548db4e22c5b718c27 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 4 Feb 2025 01:17:18 +0000 Subject: [PATCH 168/220] separating parsing and validation --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 37 ++++++++++++------- llvm/lib/Target/DirectX/DXILRootSignature.h | 7 +++- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 984505b3fb85b..c85291186f618 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -29,25 +29,18 @@ static bool reportError(Twine Message) { return true; } -static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { +bool ModuleRootSignature::parseRootFlags(MDNode *RootFlagNode) { if (RootFlagNode->getNumOperands() != 2) return reportError("Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - uint32_t Value = Flag->getZExtValue(); + this->Flags = Flag->getZExtValue(); - // Root Element validation, as specified: - // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation - if ((Value & ~0x80000fff) != 0) - return reportError("Invalid flag value for RootFlag"); - - MRS->Flags = Value; return false; } -static bool parseRootSignatureElement(ModuleRootSignature *MRS, - MDNode *Element) { +bool ModuleRootSignature::parseRootSignatureElement(MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); if (ElementText == nullptr) return reportError("Invalid format for Root Element"); @@ -67,7 +60,7 @@ static bool parseRootSignatureElement(ModuleRootSignature *MRS, switch (ElementKind) { case RootSignatureElementKind::RootFlags: { - return parseRootFlags(MRS, Element); + return parseRootFlags(Element); break; } @@ -131,19 +124,35 @@ bool ModuleRootSignature::parse(NamedMDNode *Root, const Function *EF) { if (Element == nullptr) return reportError("Missing Root Element Metadata Node."); - HasError = HasError || parseRootSignatureElement(this, Element); + HasError = HasError || parseRootSignatureElement(Element); } } return HasError; } +bool ModuleRootSignature::validateRootFlag() { + // Root Element validation, as 
specified: + // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation + if ((Flags & ~0x80000fff) != 0) + return reportError("Invalid flag value for RootFlag"); + + return false; +} + +bool ModuleRootSignature::validate() { + if (validateRootFlag()) + return reportError("Invalid flag value for RootFlag"); + + return false; +} + ModuleRootSignature ModuleRootSignature::analyzeModule(Module &M, const Function *F) { ModuleRootSignature MRS; NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); if (RootSignatureNode) { - if (MRS.parse(RootSignatureNode, F)) + if (MRS.parse(RootSignatureNode, F) || MRS.validate()) llvm_unreachable("Invalid Root Signature Metadata."); } @@ -176,7 +185,7 @@ bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { assert(MMI.EntryPropertyVec.size() == 1); const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - this->MRS = MRS = ModuleRootSignature::analyzeModule(M, EntryFunction); + MRS = ModuleRootSignature::analyzeModule(M, EntryFunction); return false; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 0650ffa7edf41..f79597721c350 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -33,10 +33,15 @@ struct ModuleRootSignature { uint32_t Flags = 0; ModuleRootSignature() = default; + static ModuleRootSignature analyzeModule(Module &M, const Function *F); +private: bool parse(NamedMDNode *Root, const Function *F); + bool parseRootSignatureElement(MDNode *Element); + bool parseRootFlags(MDNode *RootFlagNode); - static ModuleRootSignature analyzeModule(Module &M, const Function *F); + bool validate(); + bool validateRootFlag(); }; class RootSignatureAnalysis : public AnalysisInfoMixin { From d0896744e6e6894dc700fbd9cfe68c3bb8cbd8ee Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 6 Feb 2025 01:32:09 +0000 Subject: [PATCH 169/220] improve error handling --- llvm/include/llvm/BinaryFormat/DXContainer.h | 12 ++---- llvm/lib/Object/DXContainer.cpp | 17 +++----- .../lib/Target/DirectX/DXContainerGlobals.cpp | 10 +++-- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 43 ++++++++----------- llvm/lib/Target/DirectX/DXILRootSignature.h | 27 ++++++++---- .../ContainerData/RootSignature-Error.ll | 2 +- .../RootSignature-Flags-Error.ll | 4 +- .../RootSignature-Flags-Validation-Error.ll | 6 +-- .../RootSignature-RootElement-Error.ll | 2 +- 9 files changed, 61 insertions(+), 62 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index c219aa819795e..7bcf6f2bc7db5 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -550,16 +550,12 @@ static_assert(sizeof(ProgramSignatureElement) == 32, struct RootSignatureValidations { - static Expected validateRootFlag(uint32_t Flags) { - if ((Flags & ~0x80000fff) != 0) - return llvm::make_error("Invalid flag"); - return Flags; + static bool validateRootFlag(uint32_t Flags) { + return (Flags & ~0x80000fff) != 0; } - static Expected validateVersion(uint32_t Version) { - if (Version < 1 || Version > 2) - return llvm::make_error("Invalid Version"); - return Version; + static bool validateVersion(uint32_t Version) { + return (Version < 1 || Version > 2); } }; diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 4b840bae86601..1e9fdf40eeb8f 100644 --- 
a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -11,6 +11,7 @@ #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; @@ -258,11 +259,9 @@ Error DirectX::RootSignature::parse(StringRef Data) { support::endian::read(Current); Current += sizeof(uint32_t); - Expected MaybeVersion = - dxbc::RootSignatureValidations::validateVersion(VValue); - if (Error E = MaybeVersion.takeError()) - return E; - Version = MaybeVersion.get(); + if (dxbc::RootSignatureValidations::validateVersion(VValue)) + return make_error("Invalid Version"); + Version = VValue; NumParameters = support::endian::read(Current); @@ -284,11 +283,9 @@ Error DirectX::RootSignature::parse(StringRef Data) { support::endian::read(Current); Current += sizeof(uint32_t); - Expected MaybeFlag = - dxbc::RootSignatureValidations::validateRootFlag(FValue); - if (Error E = MaybeFlag.takeError()) - return E; - Flags = MaybeFlag.get(); + if (dxbc::RootSignatureValidations::validateRootFlag(FValue)) + return make_error("Invalid flag"); + Flags = FValue; return Error::success(); } diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index abc4bd09fc7c0..a04066ef5c4f0 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -153,16 +153,18 @@ void DXContainerGlobals::addSignature(Module &M, void DXContainerGlobals::addRootSignature(Module &M, SmallVector &Globals) { - std::optional MRS = - getAnalysis().getRootSignature(); - if (!MRS.has_value()) + auto &RSA = getAnalysis(); + + if (!RSA.hasRootSignature()) return; + ModuleRootSignature MRS = RSA.getRootSignature(); + SmallString<256> Data; raw_svector_ostream OS(Data); RootSignatureHeader RSH; - RSH.Flags = MRS->Flags; + RSH.Flags = MRS.Flags; RSH.write(OS); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index c85291186f618..f051de8f8c896 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -15,17 +15,23 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" +#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/Error.h" +#include using namespace llvm; using namespace llvm::dxil; -static bool reportError(Twine Message) { - report_fatal_error(Message, false); +bool ModuleRootSignature::reportError(Twine Message, + DiagnosticSeverity Severity) { + Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); return true; } @@ -130,43 +136,33 @@ bool ModuleRootSignature::parse(NamedMDNode *Root, const Function *EF) { return HasError; } -bool ModuleRootSignature::validateRootFlag() { - // Root Element validation, as specified: - // https://github.com/llvm/wg-hlsl/blob/main/proposals/0002-root-signature-in-clang.md#validations-during-dxil-generation - if ((Flags & ~0x80000fff) != 0) - return reportError("Invalid flag value for RootFlag"); - - return false; -} - bool ModuleRootSignature::validate() { - if (validateRootFlag()) + if (dxbc::RootSignatureValidations::validateRootFlag(Flags)) { return reportError("Invalid flag value 
for RootFlag"); - + } return false; } -ModuleRootSignature ModuleRootSignature::analyzeModule(Module &M, - const Function *F) { - ModuleRootSignature MRS; +OptionalRootSignature ModuleRootSignature::analyzeModule(Module &M, + const Function *F) { + ModuleRootSignature MRS(&M.getContext()); NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode) { - if (MRS.parse(RootSignatureNode, F) || MRS.validate()) - llvm_unreachable("Invalid Root Signature Metadata."); - } + if (RootSignatureNode == nullptr || MRS.parse(RootSignatureNode, F) || + MRS.validate()) + return std::nullopt; return MRS; } AnalysisKey RootSignatureAnalysis::Key; -ModuleRootSignature RootSignatureAnalysis::run(Module &M, - ModuleAnalysisManager &AM) { +OptionalRootSignature RootSignatureAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { auto MMI = AM.getResult(M); if (MMI.ShaderProfile == Triple::Library) - return ModuleRootSignature(); + return std::nullopt; assert(MMI.EntryPropertyVec.size() == 1); @@ -186,7 +182,6 @@ bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; MRS = ModuleRootSignature::analyzeModule(M, EntryFunction); - return false; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index f79597721c350..da38078ad42f8 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -12,10 +12,13 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include +#include namespace llvm { namespace dxil { @@ -31,19 +34,25 @@ enum class RootSignatureElementKind { struct ModuleRootSignature { uint32_t Flags = 0; - - ModuleRootSignature() = default; - static ModuleRootSignature analyzeModule(Module &M, const Function *F); + ModuleRootSignature() { Ctx = nullptr; }; + ModuleRootSignature(LLVMContext *Ctx) : Ctx(Ctx) {} + static std::optional analyzeModule(Module &M, + const Function *F); private: + LLVMContext *Ctx; + bool parse(NamedMDNode *Root, const Function *F); bool parseRootSignatureElement(MDNode *Element); bool parseRootFlags(MDNode *RootFlagNode); bool validate(); - bool validateRootFlag(); + + bool reportError(Twine Message, DiagnosticSeverity Severity = DS_Error); }; +using OptionalRootSignature = std::optional; + class RootSignatureAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; static AnalysisKey Key; @@ -51,9 +60,9 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { public: RootSignatureAnalysis() = default; - using Result = ModuleRootSignature; + using Result = OptionalRootSignature; - ModuleRootSignature run(Module &M, ModuleAnalysisManager &AM); + OptionalRootSignature run(Module &M, ModuleAnalysisManager &AM); }; /// Wrapper pass for the legacy pass manager. @@ -61,14 +70,16 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { /// This is required because the passes that will depend on this are codegen /// passes which run through the legacy pass manager. 
class RootSignatureAnalysisWrapper : public ModulePass { - std::optional MRS; +private: + OptionalRootSignature MRS; public: static char ID; RootSignatureAnalysisWrapper() : ModulePass(ID) {} - const std::optional &getRootSignature() { return MRS; } + const ModuleRootSignature &getRootSignature() { return MRS.value(); } + bool hasRootSignature() { return MRS.has_value(); } bool runOnModule(Module &M) override; diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll index cbcd8e56c1c04..0f0c7cc39d73b 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll @@ -2,7 +2,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: LLVM ERROR: Invalid format for Root Signature Definition. Pairs of function, root signature expected. +; CHECK: error: Invalid format for Root Signature Definition. Pairs of function, root signature expected. define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll index 9b4208011bba5..630bd5c1e3836 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll @@ -1,8 +1,8 @@ -; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s +; RUN: not llc %s --filetype=obj -o - target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: LLVM ERROR: Invalid Root Element: NOTRootFlags +; expected-error@-1: Invalid Root Element: NOTRootFlags define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll index 85e6f4d6748d5..dae3c75e70cb8 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll @@ -1,8 +1,6 @@ -; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s - +; RUN: not llc %s --filetype=obj -o - target triple = "dxil-unknown-shadermodel6.0-compute" - -; CHECK: LLVM ERROR: Invalid flag value for RootFlag +; expected-error@-1: Invalid flag value for RootFlag define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll index 501e3438943a3..80f969e849d25 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll @@ -2,7 +2,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: LLVM ERROR: Missing Root Element Metadata Node. +; CHECK: error: Missing Root Element Metadata Node. 
define void @main() #0 { From 980e7d92c95c18d4fab9b0da49462636b083ce64 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 6 Feb 2025 18:31:30 +0000 Subject: [PATCH 170/220] clean up --- llvm/lib/Object/DXContainer.cpp | 1 - llvm/lib/Target/DirectX/DXILRootSignature.cpp | 28 ++++++++----------- llvm/lib/Target/DirectX/DXILRootSignature.h | 3 +- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 1e9fdf40eeb8f..79bb1678aee25 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -11,7 +11,6 @@ #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index f051de8f8c896..dd0c7618c728d 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -102,30 +102,26 @@ bool ModuleRootSignature::parse(NamedMDNode *Root, const Function *EF) { return reportError("Invalid format for Root Signature Definition. Pairs " "of function, root signature expected."); - Metadata *MD = Node->getOperand(0).get(); - if (auto *VAM = llvm::dyn_cast(MD)) { - llvm::Value *V = VAM->getValue(); - if (Function *F = dyn_cast(V)) { - if (F != EF) - continue; - } else { - return reportError( - "Root Signature MD node, first element is not a function."); - } - } else { - return reportError( - "Root Signature MD node, first element is not a function."); - } + ValueAsMetadata *VAM = + llvm::dyn_cast(Node->getOperand(0).get()); + if (VAM == nullptr) + return reportError("First element of root signature is not a value"); + + Function *F = dyn_cast(VAM->getValue()); + if (F == nullptr) + return reportError("First element of root signature is not a function"); + + if (F != EF) + continue; // Get the Root Signature Description from the function signature pair. MDNode *RS = dyn_cast(Node->getOperand(1).get()); if (RS == nullptr) - return reportError("Missing Root Signature Metadata node."); + return reportError("Missing Root Element List Metadata node."); // Loop through the Root Elements of the root signature. 
     for (unsigned int Eid = 0; Eid < RS->getNumOperands(); Eid++) {
-
       MDNode *Element = dyn_cast<MDNode>(RS->getOperand(Eid));
       if (Element == nullptr)
         return reportError("Missing Root Element Metadata Node.");
 
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index da38078ad42f8..9bb95102952b3 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -35,13 +35,14 @@ enum class RootSignatureElementKind {
 struct ModuleRootSignature {
   uint32_t Flags = 0;
   ModuleRootSignature() { Ctx = nullptr; };
-  ModuleRootSignature(LLVMContext *Ctx) : Ctx(Ctx) {}
   static std::optional<ModuleRootSignature> analyzeModule(Module &M,
                                                           const Function *F);
 
 private:
   LLVMContext *Ctx;
 
+  ModuleRootSignature(LLVMContext *Ctx) : Ctx(Ctx) {}
+
   bool parse(NamedMDNode *Root, const Function *F);
   bool parseRootSignatureElement(MDNode *Element);
   bool parseRootFlags(MDNode *RootFlagNode);

From 04667f3f29689d5d4e458f5b1add66dcbf99054d Mon Sep 17 00:00:00 2001
From: joaosaffran
Date: Thu, 6 Feb 2025 22:04:48 +0000
Subject: [PATCH 171/220] clean up

---
 llvm/lib/Target/DirectX/DXILRootSignature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index 9bb95102952b3..dd19fd97165ee 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -18,7 +18,6 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include 
-#include 
 
 namespace llvm {
 namespace dxil {
@@ -80,6 +79,7 @@ class RootSignatureAnalysisWrapper : public ModulePass {
   RootSignatureAnalysisWrapper() : ModulePass(ID) {}
 
   const ModuleRootSignature &getRootSignature() { return MRS.value(); }
+
   bool hasRootSignature() { return MRS.has_value(); }
 
   bool runOnModule(Module &M) override;

From 8ec40aaede302554ea55dd983b62b71db1d201bf Mon Sep 17 00:00:00 2001
From: joaosaffran
Date: Thu, 6 Feb 2025 22:12:25 +0000
Subject: [PATCH 172/220] formatting

---
 llvm/lib/Target/DirectX/DXILRootSignature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index dd19fd97165ee..eb3fcbcbc5701 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -79,7 +79,7 @@ class RootSignatureAnalysisWrapper : public ModulePass {
   RootSignatureAnalysisWrapper() : ModulePass(ID) {}
 
   const ModuleRootSignature &getRootSignature() { return MRS.value(); }
-
+
   bool hasRootSignature() { return MRS.has_value(); }
 
   bool runOnModule(Module &M) override;

From b0ac6becb49e6ec87ffae7a16567b1a391f92928 Mon Sep 17 00:00:00 2001
From: joaosaffran
Date: Fri, 7 Feb 2025 18:26:44 +0000
Subject: [PATCH 173/220] addressing comments and fix tests

---
 llvm/include/llvm/BinaryFormat/DXContainer.h | 13 +++++++++----
 llvm/lib/Object/DXContainer.cpp              |  6 +++---
 llvm/unittests/Object/DXContainerTest.cpp    | 18 ++++++++++++------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index 7bcf6f2bc7db5..548760afc08e8 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -550,12 +550,17 @@ static_assert(sizeof(ProgramSignatureElement) == 32,
 
 struct RootSignatureValidations {
 
-  static bool validateRootFlag(uint32_t Flags) {
-    return (Flags & ~0x80000fff) != 0;
+  static Expected<uint32_t>
validateRootFlag(uint32_t Flags) { + if ((Flags & ~0x80000fff) != 0) + return llvm::make_error("Invalid Root Signature flag"); + return Flags; } - static bool validateVersion(uint32_t Version) { - return (Version < 1 || Version > 2); + static Expected validateVersion(uint32_t Version) { + if (Version < 1 || Version > 2) + return llvm::make_error( + "Invalid Root Signature Version"); + return Version; } }; diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 79bb1678aee25..52c78ecb76ab4 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -250,9 +250,9 @@ Error DirectX::RootSignature::parse(StringRef Data) { const char *Current = Data.begin(); // Root Signature headers expects 6 integers to be present. - if (Data.size() < 6 * sizeof(uint32_t)) { - return parseFailed("Invalid data. Too small."); - } + if (Data.size() < 6 * sizeof(uint32_t)) + return parseFailed( + "Invalid root signature, insufficient space for header."); uint32_t VValue = support::endian::read(Current); diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 8489b05f8b331..88a915f560e05 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -855,8 +855,10 @@ TEST(RootSignature, ParseRootFlags) { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<64>(Buffer)), - FailedWithMessage("Invalid data. Too small.")); + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<64>(Buffer)), + FailedWithMessage( + "Invalid root signature, insufficient space for header.")); } { // Version has been changed to an invalid number. @@ -868,8 +870,10 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), - FailedWithMessage("Invalid Version")); + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Stream Error: An unspecified error has occurred. " + "Invalid Root Signature Version")); } { // Flag has been set to an invalid value @@ -881,7 +885,9 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), - FailedWithMessage("Invalid flag")); + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Stream Error: An unspecified error has occurred. 
" + "Invalid Root Signature flag")); } } From 6a365038a19978149b4976d1608db3d94dec421c Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 7 Feb 2025 19:56:57 +0000 Subject: [PATCH 174/220] formating --- llvm/unittests/Object/DXContainerTest.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 88a915f560e05..bafde7334fbbc 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -870,10 +870,8 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; - EXPECT_THAT_EXPECTED( - DXContainer::create(getMemoryBuffer<68>(Buffer)), - FailedWithMessage("Stream Error: An unspecified error has occurred. " - "Invalid Root Signature Version")); + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Invalid Root Signature Version")); } { // Flag has been set to an invalid value @@ -885,9 +883,7 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, }; - EXPECT_THAT_EXPECTED( - DXContainer::create(getMemoryBuffer<68>(Buffer)), - FailedWithMessage("Stream Error: An unspecified error has occurred. " - "Invalid Root Signature flag")); + EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Invalid Root Signature flag")); } } From f7d2c124cc9e11da59bd050d051c53f775a75d88 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 10 Feb 2025 19:21:04 +0000 Subject: [PATCH 175/220] addressing pr comments --- llvm/include/llvm/BinaryFormat/DXContainer.h | 6 +-- .../lib/Target/DirectX/DXContainerGlobals.cpp | 6 ++- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 48 +++++++++---------- llvm/lib/Target/DirectX/DXILRootSignature.h | 26 ++-------- .../ContainerData/RootSignature-Flags.ll | 4 -- 5 files changed, 33 insertions(+), 57 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 548760afc08e8..bab475e6d9c68 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -550,11 +550,7 @@ static_assert(sizeof(ProgramSignatureElement) == 32, struct RootSignatureValidations { - static Expected validateRootFlag(uint32_t Flags) { - if ((Flags & ~0x80000fff) != 0) - return llvm::make_error("Invalid Root Signature flag"); - return Flags; - } + static bool validateRootFlag(uint32_t Flags) { return (Flags & ~0xfff) != 0; } static Expected validateVersion(uint32_t Version) { if (Version < 1 || Version > 2) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index a04066ef5c4f0..96c90e46f247d 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include using namespace llvm; using namespace llvm::dxil; @@ -154,11 +155,12 @@ void DXContainerGlobals::addRootSignature(Module &M, SmallVector &Globals) { auto &RSA = getAnalysis(); + std::optional MaybeRootSignature = RSA.getResult(); - if (!RSA.hasRootSignature()) + if (!MaybeRootSignature.has_value()) return; - ModuleRootSignature MRS = RSA.getRootSignature(); + 
ModuleRootSignature MRS = MaybeRootSignature.value(); SmallString<256> Data; raw_svector_ostream OS(Data); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index dd0c7618c728d..d6cc18c54a7e8 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -29,24 +29,26 @@ using namespace llvm; using namespace llvm::dxil; -bool ModuleRootSignature::reportError(Twine Message, - DiagnosticSeverity Severity) { +LLVMContext *Ctx; + +static bool reportError(Twine Message, DiagnosticSeverity Severity = DS_Error) { Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); return true; } -bool ModuleRootSignature::parseRootFlags(MDNode *RootFlagNode) { +static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { if (RootFlagNode->getNumOperands() != 2) return reportError("Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - this->Flags = Flag->getZExtValue(); + MRS->Flags = Flag->getZExtValue(); return false; } -bool ModuleRootSignature::parseRootSignatureElement(MDNode *Element) { +static bool parseRootSignatureElement(ModuleRootSignature *MRS, + MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); if (ElementText == nullptr) return reportError("Invalid format for Root Element"); @@ -65,24 +67,21 @@ bool ModuleRootSignature::parseRootSignatureElement(MDNode *Element) { switch (ElementKind) { - case RootSignatureElementKind::RootFlags: { - return parseRootFlags(Element); - break; - } - + case RootSignatureElementKind::RootFlags: + return parseRootFlags(MRS, Element); case RootSignatureElementKind::RootConstants: case RootSignatureElementKind::RootDescriptor: case RootSignatureElementKind::DescriptorTable: case RootSignatureElementKind::StaticSampler: case RootSignatureElementKind::None: return reportError("Invalid Root Element: " + ElementText->getString()); - break; } return true; } -bool ModuleRootSignature::parse(NamedMDNode *Root, const Function *EF) { +static bool parse(ModuleRootSignature *MRS, NamedMDNode *Root, + const Function *EF) { bool HasError = false; /** Root Signature are specified as following in the metadata: @@ -93,7 +92,7 @@ bool ModuleRootSignature::parse(NamedMDNode *Root, const Function *EF) { So for each MDNode inside dx.rootsignatures NamedMDNode (the Root parameter of this function), the parsing process needs - to loop through each of it's operand and process the pairs function + to loop through each of its operands and process the function, signature pair. 
*/ @@ -126,26 +125,27 @@ bool ModuleRootSignature::parse(NamedMDNode *Root, const Function *EF) { if (Element == nullptr) return reportError("Missing Root Element Metadata Node."); - HasError = HasError || parseRootSignatureElement(Element); + HasError = HasError || parseRootSignatureElement(MRS, Element); } } return HasError; } -bool ModuleRootSignature::validate() { - if (dxbc::RootSignatureValidations::validateRootFlag(Flags)) { - return reportError("Invalid flag value for RootFlag"); +static bool validate(ModuleRootSignature *MRS) { + if (dxbc::RootSignatureValidations::validateRootFlag(MRS->Flags)) { + return reportError("Invalid Root Signature flag value"); } return false; } -OptionalRootSignature ModuleRootSignature::analyzeModule(Module &M, - const Function *F) { - ModuleRootSignature MRS(&M.getContext()); +std::optional +ModuleRootSignature::analyzeModule(Module &M, const Function *F) { + ModuleRootSignature MRS; + Ctx = &M.getContext(); NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode == nullptr || MRS.parse(RootSignatureNode, F) || - MRS.validate()) + if (RootSignatureNode == nullptr || parse(&MRS, RootSignatureNode, F) || + validate(&MRS)) return std::nullopt; return MRS; @@ -153,8 +153,8 @@ OptionalRootSignature ModuleRootSignature::analyzeModule(Module &M, AnalysisKey RootSignatureAnalysis::Key; -OptionalRootSignature RootSignatureAnalysis::run(Module &M, - ModuleAnalysisManager &AM) { +std::optional +RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { auto MMI = AM.getResult(M); if (MMI.ShaderProfile == Triple::Library) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index eb3fcbcbc5701..ca8801fe6bbef 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -33,26 +33,10 @@ enum class RootSignatureElementKind { struct ModuleRootSignature { uint32_t Flags = 0; - ModuleRootSignature() { Ctx = nullptr; }; static std::optional analyzeModule(Module &M, const Function *F); - -private: - LLVMContext *Ctx; - - ModuleRootSignature(LLVMContext *Ctx) : Ctx(Ctx) {} - - bool parse(NamedMDNode *Root, const Function *F); - bool parseRootSignatureElement(MDNode *Element); - bool parseRootFlags(MDNode *RootFlagNode); - - bool validate(); - - bool reportError(Twine Message, DiagnosticSeverity Severity = DS_Error); }; -using OptionalRootSignature = std::optional; - class RootSignatureAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; static AnalysisKey Key; @@ -60,9 +44,9 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { public: RootSignatureAnalysis() = default; - using Result = OptionalRootSignature; + using Result = std::optional; - OptionalRootSignature run(Module &M, ModuleAnalysisManager &AM); + std::optional run(Module &M, ModuleAnalysisManager &AM); }; /// Wrapper pass for the legacy pass manager. @@ -71,16 +55,14 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { /// passes which run through the legacy pass manager. 
class RootSignatureAnalysisWrapper : public ModulePass { private: - OptionalRootSignature MRS; + std::optional MRS; public: static char ID; RootSignatureAnalysisWrapper() : ModulePass(ID) {} - const ModuleRootSignature &getRootSignature() { return MRS.value(); } - - bool hasRootSignature() { return MRS.has_value(); } + std::optional getResult() const { return MRS; } bool runOnModule(Module &M) override; diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index cf00609a7307e..3f5bb166ad0e5 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -9,10 +9,6 @@ define void @main() #0 { entry: ret void } - - - - attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } From d6c98edfe21d417c561e081d8d6dc8abde85bd55 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 11 Feb 2025 00:04:13 +0000 Subject: [PATCH 176/220] addressing PR comments --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 79 ++++++++++++------- llvm/lib/Target/DirectX/DXILRootSignature.h | 2 +- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index d6cc18c54a7e8..73da337d9ce0e 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -29,17 +29,17 @@ using namespace llvm; using namespace llvm::dxil; -LLVMContext *Ctx; - -static bool reportError(Twine Message, DiagnosticSeverity Severity = DS_Error) { +static bool reportError(LLVMContext *Ctx, Twine Message, + DiagnosticSeverity Severity = DS_Error) { Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); return true; } -static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { +static bool parseRootFlags(LLVMContext *Ctx, ModuleRootSignature *MRS, + MDNode *RootFlagNode) { if (RootFlagNode->getNumOperands() != 2) - return reportError("Invalid format for RootFlag Element"); + return reportError(Ctx, "Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); MRS->Flags = Flag->getZExtValue(); @@ -47,11 +47,12 @@ static bool parseRootFlags(ModuleRootSignature *MRS, MDNode *RootFlagNode) { return false; } -static bool parseRootSignatureElement(ModuleRootSignature *MRS, +static bool parseRootSignatureElement(LLVMContext *Ctx, + ModuleRootSignature *MRS, MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); if (ElementText == nullptr) - return reportError("Invalid format for Root Element"); + return reportError(Ctx, "Invalid format for Root Element"); RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) @@ -68,19 +69,20 @@ static bool parseRootSignatureElement(ModuleRootSignature *MRS, switch (ElementKind) { case RootSignatureElementKind::RootFlags: - return parseRootFlags(MRS, Element); + return parseRootFlags(Ctx, MRS, Element); case RootSignatureElementKind::RootConstants: case RootSignatureElementKind::RootDescriptor: case RootSignatureElementKind::DescriptorTable: case RootSignatureElementKind::StaticSampler: case RootSignatureElementKind::None: - return reportError("Invalid Root Element: " + ElementText->getString()); + return reportError(Ctx, + "Invalid Root Element: " + ElementText->getString()); } return true; } -static bool parse(ModuleRootSignature *MRS, NamedMDNode *Root, +static bool parse(LLVMContext *Ctx, 
ModuleRootSignature *MRS, NamedMDNode *Root, const Function *EF) { bool HasError = false; @@ -97,18 +99,27 @@ static bool parse(ModuleRootSignature *MRS, NamedMDNode *Root, */ for (const MDNode *Node : Root->operands()) { - if (Node->getNumOperands() != 2) - return reportError("Invalid format for Root Signature Definition. Pairs " - "of function, root signature expected."); + if (Node->getNumOperands() != 2) { + HasError = reportError( + Ctx, "Invalid format for Root Signature Definition. Pairs " + "of function, root signature expected."); + continue; + } ValueAsMetadata *VAM = llvm::dyn_cast(Node->getOperand(0).get()); - if (VAM == nullptr) - return reportError("First element of root signature is not a value"); + if (VAM == nullptr) { + HasError = + reportError(Ctx, "First element of root signature is not a value"); + continue; + } Function *F = dyn_cast(VAM->getValue()); - if (F == nullptr) - return reportError("First element of root signature is not a function"); + if (F == nullptr) { + HasError = + reportError(Ctx, "First element of root signature is not a function"); + continue; + } if (F != EF) continue; @@ -116,24 +127,26 @@ static bool parse(ModuleRootSignature *MRS, NamedMDNode *Root, // Get the Root Signature Description from the function signature pair. MDNode *RS = dyn_cast(Node->getOperand(1).get()); - if (RS == nullptr) - return reportError("Missing Root Element List Metadata node."); + if (RS == nullptr) { + reportError(Ctx, "Missing Root Element List Metadata node."); + continue; + } // Loop through the Root Elements of the root signature. for (unsigned int Eid = 0; Eid < RS->getNumOperands(); Eid++) { MDNode *Element = dyn_cast(RS->getOperand(Eid)); if (Element == nullptr) - return reportError("Missing Root Element Metadata Node."); + return reportError(Ctx, "Missing Root Element Metadata Node."); - HasError = HasError || parseRootSignatureElement(MRS, Element); + HasError = HasError || parseRootSignatureElement(Ctx, MRS, Element); } } return HasError; } -static bool validate(ModuleRootSignature *MRS) { +static bool validate(LLVMContext *Ctx, ModuleRootSignature *MRS) { if (dxbc::RootSignatureValidations::validateRootFlag(MRS->Flags)) { - return reportError("Invalid Root Signature flag value"); + return reportError(Ctx, "Invalid Root Signature flag value"); } return false; } @@ -141,11 +154,11 @@ static bool validate(ModuleRootSignature *MRS) { std::optional ModuleRootSignature::analyzeModule(Module &M, const Function *F) { ModuleRootSignature MRS; - Ctx = &M.getContext(); + LLVMContext *Ctx = &M.getContext(); NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode == nullptr || parse(&MRS, RootSignatureNode, F) || - validate(&MRS)) + if (RootSignatureNode == nullptr || parse(Ctx, &MRS, RootSignatureNode, F) || + validate(Ctx, &MRS)) return std::nullopt; return MRS; @@ -160,7 +173,12 @@ RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { if (MMI.ShaderProfile == Triple::Library) return std::nullopt; - assert(MMI.EntryPropertyVec.size() == 1); + LLVMContext *Ctx = &M.getContext(); + + if (MMI.EntryPropertyVec.size() != 1) { + reportError(Ctx, "More than one entry function defined."); + return std::nullopt; + } const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; return ModuleRootSignature::analyzeModule(M, EntryFunction); @@ -174,7 +192,12 @@ bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { if (MMI.ShaderProfile == Triple::Library) return false; - assert(MMI.EntryPropertyVec.size() == 1); + + 
LLVMContext *Ctx = &M.getContext(); + if (MMI.EntryPropertyVec.size() != 1) { + reportError(Ctx, "More than one entry function defined."); + return false; + } const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; MRS = ModuleRootSignature::analyzeModule(M, EntryFunction); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index ca8801fe6bbef..ab9a7c3da9a19 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -62,7 +62,7 @@ class RootSignatureAnalysisWrapper : public ModulePass { RootSignatureAnalysisWrapper() : ModulePass(ID) {} - std::optional getResult() const { return MRS; } + const std::optional &getResult() const { return MRS; } bool runOnModule(Module &M) override; From 36746f5b885713bfdaeaf7c7f51266842a063425 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 11 Feb 2025 20:40:00 +0000 Subject: [PATCH 177/220] addressing pr comments --- llvm/include/llvm/BinaryFormat/DXContainer.h | 9 +-- llvm/lib/Object/DXContainer.cpp | 8 +-- .../lib/Target/DirectX/DXContainerGlobals.cpp | 8 +-- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 70 ++++++------------- llvm/lib/Target/DirectX/DXILRootSignature.h | 17 ++--- 5 files changed, 37 insertions(+), 75 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index bab475e6d9c68..75b7576c90894 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -550,13 +550,10 @@ static_assert(sizeof(ProgramSignatureElement) == 32, struct RootSignatureValidations { - static bool validateRootFlag(uint32_t Flags) { return (Flags & ~0xfff) != 0; } + static bool isValidRootFlag(uint32_t Flags) { return (Flags & ~0xfff) == 0; } - static Expected validateVersion(uint32_t Version) { - if (Version < 1 || Version > 2) - return llvm::make_error( - "Invalid Root Signature Version"); - return Version; + static bool isValidVersion(uint32_t Version) { + return (Version == 1 || Version == 2); } }; diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 52c78ecb76ab4..eb825d6299026 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -258,8 +258,8 @@ Error DirectX::RootSignature::parse(StringRef Data) { support::endian::read(Current); Current += sizeof(uint32_t); - if (dxbc::RootSignatureValidations::validateVersion(VValue)) - return make_error("Invalid Version"); + if (!dxbc::RootSignatureValidations::isValidVersion(VValue)) + return make_error("Invalid Root Signature Version"); Version = VValue; NumParameters = @@ -282,8 +282,8 @@ Error DirectX::RootSignature::parse(StringRef Data) { support::endian::read(Current); Current += sizeof(uint32_t); - if (dxbc::RootSignatureValidations::validateRootFlag(FValue)) - return make_error("Invalid flag"); + if (!dxbc::RootSignatureValidations::isValidRootFlag(FValue)) + return make_error("Invalid Root Signature flag"); Flags = FValue; return Error::success(); diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 96c90e46f247d..6ba58109ca0d4 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -155,18 +155,14 @@ void DXContainerGlobals::addRootSignature(Module &M, SmallVector &Globals) { auto &RSA = getAnalysis(); - std::optional MaybeRootSignature = RSA.getResult(); - - if (!MaybeRootSignature.has_value()) + if 
(!RSA.getResult()) return; - ModuleRootSignature MRS = MaybeRootSignature.value(); - SmallString<256> Data; raw_svector_ostream OS(Data); RootSignatureHeader RSH; - RSH.Flags = MRS.Flags; + RSH.Flags = RSA.getResult()->Flags; RSH.write(OS); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 73da337d9ce0e..b4cde78c3748a 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -57,24 +57,13 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) .Case("RootFlags", RootSignatureElementKind::RootFlags) - .Case("RootConstants", RootSignatureElementKind::RootConstants) - .Case("RootCBV", RootSignatureElementKind::RootDescriptor) - .Case("RootSRV", RootSignatureElementKind::RootDescriptor) - .Case("RootUAV", RootSignatureElementKind::RootDescriptor) - .Case("Sampler", RootSignatureElementKind::RootDescriptor) - .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable) - .Case("StaticSampler", RootSignatureElementKind::StaticSampler) .Default(RootSignatureElementKind::None); switch (ElementKind) { case RootSignatureElementKind::RootFlags: return parseRootFlags(Ctx, MRS, Element); - case RootSignatureElementKind::RootConstants: - case RootSignatureElementKind::RootDescriptor: - case RootSignatureElementKind::DescriptorTable: - case RootSignatureElementKind::StaticSampler: - case RootSignatureElementKind::None: + default: return reportError(Ctx, "Invalid Root Element: " + ElementText->getString()); } @@ -83,7 +72,7 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, } static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, - const Function *EF) { + const Function *EntryFunction) { bool HasError = false; /** Root Signature are specified as following in the metadata: @@ -121,7 +110,7 @@ static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, continue; } - if (F != EF) + if (F != EntryFunction) continue; // Get the Root Signature Description from the function signature pair. @@ -133,8 +122,8 @@ static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, } // Loop through the Root Elements of the root signature. 
- for (unsigned int Eid = 0; Eid < RS->getNumOperands(); Eid++) { - MDNode *Element = dyn_cast(RS->getOperand(Eid)); + for (const auto &Operand : RS->operands()) { + MDNode *Element = dyn_cast(Operand); if (Element == nullptr) return reportError(Ctx, "Missing Root Element Metadata Node."); @@ -145,20 +134,30 @@ static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, } static bool validate(LLVMContext *Ctx, ModuleRootSignature *MRS) { - if (dxbc::RootSignatureValidations::validateRootFlag(MRS->Flags)) { + if (!dxbc::RootSignatureValidations::isValidRootFlag(MRS->Flags)) { return reportError(Ctx, "Invalid Root Signature flag value"); } return false; } std::optional -ModuleRootSignature::analyzeModule(Module &M, const Function *F) { - ModuleRootSignature MRS; +ModuleRootSignature::analyzeModule(Module &M, ModuleMetadataInfo MMI) { + if (MMI.ShaderProfile == Triple::Library) + return std::nullopt; + LLVMContext *Ctx = &M.getContext(); + if (MMI.EntryPropertyVec.size() != 1) { + reportError(Ctx, "More than one entry function defined."); + return std::nullopt; + } + + ModuleRootSignature MRS; + const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; + NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode == nullptr || parse(Ctx, &MRS, RootSignatureNode, F) || - validate(Ctx, &MRS)) + if (RootSignatureNode == nullptr || + parse(Ctx, &MRS, RootSignatureNode, EntryFunction) || validate(Ctx, &MRS)) return std::nullopt; return MRS; @@ -168,20 +167,8 @@ AnalysisKey RootSignatureAnalysis::Key; std::optional RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { - auto MMI = AM.getResult(M); - - if (MMI.ShaderProfile == Triple::Library) - return std::nullopt; - - LLVMContext *Ctx = &M.getContext(); - - if (MMI.EntryPropertyVec.size() != 1) { - reportError(Ctx, "More than one entry function defined."); - return std::nullopt; - } - - const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - return ModuleRootSignature::analyzeModule(M, EntryFunction); + ModuleMetadataInfo MMI = AM.getResult(M); + return ModuleRootSignature::analyzeModule(M, MMI); } //===----------------------------------------------------------------------===// @@ -189,18 +176,7 @@ bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { dxil::ModuleMetadataInfo &MMI = getAnalysis().getModuleMetadata(); - - if (MMI.ShaderProfile == Triple::Library) - return false; - - LLVMContext *Ctx = &M.getContext(); - if (MMI.EntryPropertyVec.size() != 1) { - reportError(Ctx, "More than one entry function defined."); - return false; - } - - const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - MRS = ModuleRootSignature::analyzeModule(M, EntryFunction); + MRS = ModuleRootSignature::analyzeModule(M, MMI); return false; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index ab9a7c3da9a19..638f44a58778a 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -1,5 +1,4 @@ -//===- DXILRootSignature.h - DXIL Root Signature helper objects -//---------------===// +//===- DXILRootSignature.h - DXIL Root Signature helper objects -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,6 +11,7 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -22,19 +22,12 @@ namespace llvm { namespace dxil { -enum class RootSignatureElementKind { - None = 0, - RootFlags = 1, - RootConstants = 2, - RootDescriptor = 3, - DescriptorTable = 4, - StaticSampler = 5 -}; +enum class RootSignatureElementKind { None = 0, RootFlags = 1 }; struct ModuleRootSignature { uint32_t Flags = 0; - static std::optional analyzeModule(Module &M, - const Function *F); + static std::optional + analyzeModule(Module &M, ModuleMetadataInfo MMI); }; class RootSignatureAnalysis : public AnalysisInfoMixin { From b5208e8622d9509964ecf66585e6b61faf0d7b9c Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 11 Feb 2025 20:55:04 +0000 Subject: [PATCH 178/220] removing copies from root signature use in dx container globals --- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 6ba58109ca0d4..029c62b3f5a96 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -155,14 +155,16 @@ void DXContainerGlobals::addRootSignature(Module &M, SmallVector &Globals) { auto &RSA = getAnalysis(); + if (!RSA.getResult()) return; + const ModuleRootSignature &MRS = RSA.getResult().value(); SmallString<256> Data; raw_svector_ostream OS(Data); RootSignatureHeader RSH; - RSH.Flags = RSA.getResult()->Flags; + RSH.Flags = MRS.Flags; RSH.write(OS); From 1fd656802013c550c7a98d04de3bb8c730dbd965 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 00:48:50 +0000 Subject: [PATCH 179/220] adding more tests --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 41 +++++++++++-------- llvm/lib/Target/DirectX/DXILRootSignature.h | 4 +- .../RootSignature-Flags-Error.ll | 19 --------- ...tSignature-MultipleEntryFunctions-Error.ll | 26 ++++++++++++ .../RootSignature-MultipleEntryFunctions.ll | 29 ++++++------- 5 files changed, 65 insertions(+), 54 deletions(-) delete mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index b4cde78c3748a..7837b4cd2eeb9 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -63,9 +64,6 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, case RootSignatureElementKind::RootFlags: return parseRootFlags(Ctx, MRS, Element); - default: - return reportError(Ctx, - "Invalid Root Element: " + ElementText->getString()); } return true; @@ -95,8 +93,14 @@ static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, continue; } + const MDOperand &FunctionPointerMdNode = Node->getOperand(0); + if (FunctionPointerMdNode == nullptr) { + // Function was pruned during compilation. 
+ continue; + } + ValueAsMetadata *VAM = - llvm::dyn_cast(Node->getOperand(0).get()); + llvm::dyn_cast(FunctionPointerMdNode.get()); if (VAM == nullptr) { HasError = reportError(Ctx, "First element of root signature is not a value"); @@ -140,24 +144,26 @@ static bool validate(LLVMContext *Ctx, ModuleRootSignature *MRS) { return false; } -std::optional -ModuleRootSignature::analyzeModule(Module &M, ModuleMetadataInfo MMI) { - if (MMI.ShaderProfile == Triple::Library) - return std::nullopt; +static const Function *getEntryFunction(Module &M, ModuleMetadataInfo MMI) { LLVMContext *Ctx = &M.getContext(); - if (MMI.EntryPropertyVec.size() != 1) { reportError(Ctx, "More than one entry function defined."); - return std::nullopt; + return nullptr; } + return MMI.EntryPropertyVec[0].Entry; +} + +std::optional +ModuleRootSignature::analyzeModule(Module &M, const Function *F) { + + LLVMContext *Ctx = &M.getContext(); ModuleRootSignature MRS; - const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode == nullptr || - parse(Ctx, &MRS, RootSignatureNode, EntryFunction) || validate(Ctx, &MRS)) + if (RootSignatureNode == nullptr || parse(Ctx, &MRS, RootSignatureNode, F) || + validate(Ctx, &MRS)) return std::nullopt; return MRS; @@ -168,15 +174,18 @@ AnalysisKey RootSignatureAnalysis::Key; std::optional RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { ModuleMetadataInfo MMI = AM.getResult(M); - return ModuleRootSignature::analyzeModule(M, MMI); + if (MMI.ShaderProfile == Triple::Library) + return std::nullopt; + return ModuleRootSignature::analyzeModule(M, getEntryFunction(M, MMI)); } //===----------------------------------------------------------------------===// bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { - dxil::ModuleMetadataInfo &MMI = getAnalysis().getModuleMetadata(); - MRS = ModuleRootSignature::analyzeModule(M, MMI); + if (MMI.ShaderProfile == Triple::Library) + return false; + MRS = ModuleRootSignature::analyzeModule(M, getEntryFunction(M, MMI)); return false; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 638f44a58778a..d97c666f76e9e 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -26,8 +26,8 @@ enum class RootSignatureElementKind { None = 0, RootFlags = 1 }; struct ModuleRootSignature { uint32_t Flags = 0; - static std::optional - analyzeModule(Module &M, ModuleMetadataInfo MMI); + static std::optional analyzeModule(Module &M, + const Function *F); }; class RootSignatureAnalysis : public AnalysisInfoMixin { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll deleted file mode 100644 index 630bd5c1e3836..0000000000000 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: not llc %s --filetype=obj -o - - -target triple = "dxil-unknown-shadermodel6.0-compute" - -; expected-error@-1: Invalid Root Element: NOTRootFlags - - -define void @main() #0 { -entry: - ret void -} - -attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } - - -!dx.rootsignatures = !{!2} ; list of function/root signature pairs -!2 = !{ ptr @main, !3 } ; function, root signature -!3 = !{ !4 } ; list of root signature elements -!4 = !{ !"NOTRootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout 
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll new file mode 100644 index 0000000000000..dd3fbe810d846 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll @@ -0,0 +1,26 @@ +; RUN: not --crash llc %s --filetype=obj -o - 2>&1 | FileCheck %s +; CHECK: error: More than one entry function defined + +target triple = "dxil-unknown-shadermodel6.0-compute" + + +define void @main() #0 { +entry: + ret void +} + +define void @anotherMain() #1 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } +attributes #1 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs +!2 = !{ ptr @main, !3 } ; function, root signature +!3 = !{ !4 } ; list of root signature elements +!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout +!5 = !{ ptr @anotherMain, !6 } ; function, root signature +!6 = !{ !7 } ; list of root signature elements +!7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index 7adb17d0b022f..eb3c738a5fc03 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -1,9 +1,9 @@ -; RUN: opt -passes='print' %s -S -o - 2>&1 | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" -define void @main() #0 { +define void @main() { entry: ret void } @@ -23,18 +23,13 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !6 = !{ !7 } ; list of root signature elements !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout -; CHECK-LABEL: Definition for 'main': -; CHECK-NEXT: Flags: 0x000001 -; CHECK-NEXT: Version: 2 -; CHECK-NEXT: NumParameters: 0 -; CHECK-NEXT: RootParametersOffset: 0 -; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 - -; CHECK-LABEL: Definition for 'anotherMain': -; CHECK-NEXT: Flags: 0x000002 -; CHECK-NEXT: Version: 2 -; CHECK-NEXT: NumParameters: 0 -; CHECK-NEXT: RootParametersOffset: 0 -; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 + +; CHECK: - Name: RTS0 +; CHECK-NEXT: Size: 24 +; CHECK-NEXT: RootSignature: +; CHECK-NEXT: Version: 2 +; CHECK-NEXT: NumParameters: 0 +; CHECK-NEXT: RootParametersOffset: 0 +; CHECK-NEXT: NumStaticSamplers: 0 +; CHECK-NEXT: StaticSamplersOffset: 0 +; CHECK-NEXT: DenyVertexShaderRootAccess: true From cde46348c06486214b5f20f78cc9ba15e96d1708 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 01:03:40 +0000 Subject: [PATCH 180/220] maybe fix test? 
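
For context, the error tests touched in this area all drive the metadata parser
with function/root-signature pairs. A minimal input, mirroring the existing
RootSignature-Flags tests earlier in this series (the flag value shown is only
an example), looks like:

  !dx.rootsignatures = !{!2}          ; list of function/root signature pairs
  !2 = !{ ptr @main, !3 }             ; function, root signature
  !3 = !{ !4 }                        ; list of root signature elements
  !4 = !{ !"RootFlags", i32 1 }       ; 1 = allow_input_assembler_input_layout
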
--- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 3 +-- llvm/lib/Target/DirectX/DXILRootSignature.h | 2 +- .../RootSignature-MultipleEntryFunctions-Error.ll | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 7837b4cd2eeb9..73b6dd96a9fa0 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -57,8 +57,7 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) - .Case("RootFlags", RootSignatureElementKind::RootFlags) - .Default(RootSignatureElementKind::None); + .Case("RootFlags", RootSignatureElementKind::RootFlags); switch (ElementKind) { diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index d97c666f76e9e..427593c89418b 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -22,7 +22,7 @@ namespace llvm { namespace dxil { -enum class RootSignatureElementKind { None = 0, RootFlags = 1 }; +enum class RootSignatureElementKind { RootFlags = 1 }; struct ModuleRootSignature { uint32_t Flags = 0; diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll index dd3fbe810d846..033661ae9a0bb 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll @@ -1,5 +1,5 @@ -; RUN: not --crash llc %s --filetype=obj -o - 2>&1 | FileCheck %s -; CHECK: error: More than one entry function defined +; RUN: not --crash llc %s --filetype=obj -o - +; expected-error@-1: More than one entry function defined target triple = "dxil-unknown-shadermodel6.0-compute" From 4410e7bf2d3ba102e9bc394e015ffb89a1359ab2 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 01:35:12 +0000 Subject: [PATCH 181/220] try fix format --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 6 +++++- llvm/lib/Target/DirectX/DXILRootSignature.h | 2 +- .../RootSignature-Flags-Error.ll | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 73b6dd96a9fa0..3a80a938e8ec2 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -57,12 +57,16 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) - .Case("RootFlags", RootSignatureElementKind::RootFlags); + .Case("RootFlags", RootSignatureElementKind::RootFlags) + .Default(RootSignatureElementKind::None); switch (ElementKind) { case RootSignatureElementKind::RootFlags: return parseRootFlags(Ctx, MRS, Element); + case RootSignatureElementKind::None: + return reportError(Ctx, + "Invalid Root Element: " + ElementText->getString()); } return true; diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 427593c89418b..d97c666f76e9e 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -22,7 +22,7 @@ namespace llvm { 
namespace dxil { -enum class RootSignatureElementKind { RootFlags = 1 }; +enum class RootSignatureElementKind { None = 0, RootFlags = 1 }; struct ModuleRootSignature { uint32_t Flags = 0; diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll new file mode 100644 index 0000000000000..630bd5c1e3836 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll @@ -0,0 +1,19 @@ +; RUN: not llc %s --filetype=obj -o - + +target triple = "dxil-unknown-shadermodel6.0-compute" + +; expected-error@-1: Invalid Root Element: NOTRootFlags + + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + + +!dx.rootsignatures = !{!2} ; list of function/root signature pairs +!2 = !{ ptr @main, !3 } ; function, root signature +!3 = !{ !4 } ; list of root signature elements +!4 = !{ !"NOTRootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout From cbdb114c81ac1a2be3681ec0ff9f24a0aa902318 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 01:47:06 +0000 Subject: [PATCH 182/220] removing test --- ...tSignature-MultipleEntryFunctions-Error.ll | 26 ------------------- 1 file changed, 26 deletions(-) delete mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll deleted file mode 100644 index 033661ae9a0bb..0000000000000 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: not --crash llc %s --filetype=obj -o - -; expected-error@-1: More than one entry function defined - -target triple = "dxil-unknown-shadermodel6.0-compute" - - -define void @main() #0 { -entry: - ret void -} - -define void @anotherMain() #1 { -entry: - ret void -} - -attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } -attributes #1 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } - -!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs -!2 = !{ ptr @main, !3 } ; function, root signature -!3 = !{ !4 } ; list of root signature elements -!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout -!5 = !{ ptr @anotherMain, !6 } ; function, root signature -!6 = !{ !7 } ; list of root signature elements -!7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout From 667ee17558f5da9b66a83c23643355e091de94f3 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 04:19:58 +0000 Subject: [PATCH 183/220] adding llvm unreachable and testing test --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 2 +- ...tSignature-MultipleEntryFunctions-Error.ll | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 3a80a938e8ec2..075925845d58b 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -69,7 +69,7 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, "Invalid Root Element: " + ElementText->getString()); } - return true; + llvm_unreachable("Root signature element kind not 
expected."); } static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll new file mode 100644 index 0000000000000..6df70632730b7 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll @@ -0,0 +1,26 @@ +; RUN: not llc %s --filetype=obj -o - +; expected-error@-1: More than one entry function defined + +target triple = "dxil-unknown-shadermodel6.0-compute" + + +define void @main() #0 { +entry: + ret void +} + +define void @anotherMain() #1 { +entry: + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } +attributes #1 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs +!2 = !{ ptr @main, !3 } ; function, root signature +!3 = !{ !4 } ; list of root signature elements +!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout +!5 = !{ ptr @anotherMain, !6 } ; function, root signature +!6 = !{ !7 } ; list of root signature elements +!7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout From d0dae8b69a8d4e1ba726d33898a52c0a7ca2ea39 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 06:57:10 +0000 Subject: [PATCH 184/220] stopping compilation if root signature error were emitted --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 075925845d58b..afa7e327f6292 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -25,6 +25,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include using namespace llvm; @@ -152,6 +153,8 @@ static const Function *getEntryFunction(Module &M, ModuleMetadataInfo MMI) { LLVMContext *Ctx = &M.getContext(); if (MMI.EntryPropertyVec.size() != 1) { reportError(Ctx, "More than one entry function defined."); + // needed to stop compilation + report_fatal_error("Invalid Root Signature Definition", false); return nullptr; } return MMI.EntryPropertyVec[0].Entry; @@ -165,10 +168,15 @@ ModuleRootSignature::analyzeModule(Module &M, const Function *F) { ModuleRootSignature MRS; NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode == nullptr || parse(Ctx, &MRS, RootSignatureNode, F) || - validate(Ctx, &MRS)) + if (RootSignatureNode == nullptr) return std::nullopt; + if (parse(Ctx, &MRS, RootSignatureNode, F) || validate(Ctx, &MRS)) { + // needed to stop compilation + report_fatal_error("Invalid Root Signature Definition", false); + return std::nullopt; + } + return MRS; } From b1b28f86124918bffdaab765e1f02642daf0a918 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 17:37:32 +0000 Subject: [PATCH 185/220] making sure Error tests fail --- .../DirectX/ContainerData/RootSignature-Flags-Error.ll | 4 ++-- .../ContainerData/RootSignature-Flags-Validation-Error.ll | 6 ++++-- .../RootSignature-MultipleEntryFunctions-Error.ll | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll 
b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll index 630bd5c1e3836..31e9db6f36e5e 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll @@ -1,8 +1,8 @@ -; RUN: not llc %s --filetype=obj -o - +; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" -; expected-error@-1: Invalid Root Element: NOTRootFlags +; CHECK: error: Invalid Root Element: NOTRootFlags define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll index dae3c75e70cb8..43c9ecbd36570 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll @@ -1,6 +1,8 @@ -; RUN: not llc %s --filetype=obj -o - +; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s + +; CHECK: error: Invalid Root Signature flag value + target triple = "dxil-unknown-shadermodel6.0-compute" -; expected-error@-1: Invalid flag value for RootFlag define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll index 6df70632730b7..d6e2030cb6fda 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll @@ -1,5 +1,6 @@ -; RUN: not llc %s --filetype=obj -o - -; expected-error@-1: More than one entry function defined +; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s + +; CHECK: error: More than one entry function defined target triple = "dxil-unknown-shadermodel6.0-compute" From 0efd8cc149e7a77a7935a34974b9a503e1a45f68 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 19:45:54 +0000 Subject: [PATCH 186/220] adding root constants --- llvm/include/llvm/BinaryFormat/DXContainer.h | 14 ++++++++++++ .../BinaryFormat/DXContainerConstants.def | 17 ++++++++++++++ .../llvm/MC/DXContainerRootSignature.h | 22 +++++++++++++++++++ llvm/lib/MC/DXContainerRootSignature.cpp | 19 +++++++++++++++- 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 75b7576c90894..0f495f5935995 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -159,6 +159,20 @@ enum class RootElementFlag : uint32_t { #include "DXContainerConstants.def" }; +#define ROOT_PARAMETER(Val, Enum) Enum = Val, +enum class RootParameterType : uint8_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getRootParameterTypes(); + +#define SHADER_VISIBILITY(Val, Enum) Enum = Val, +enum class ShaderVisibilityFlag : uint8_t { +#include "DXContainerConstants.def" +}; + +ArrayRef> getShaderVisibilityFlags(); + PartType parsePartType(StringRef S); struct VertexPSVInfo { diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 6d44ea14df444..e978d438a5f1c 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -72,6 +72,23 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) 
#undef ROOT_ELEMENT_FLAG #endif // ROOT_ELEMENT_FLAG +#ifdef ROOT_PARAMETER + +ROOT_PARAMETER(1, Constants32Bit) +#undef ROOT_PARAMETER +#endif // ROOT_PARAMETER + +#ifdef SHADER_VISIBILITY +SHADER_VISIBILITY(0, All) +SHADER_VISIBILITY(1, Vertex) +SHADER_VISIBILITY(2, Hull) +SHADER_VISIBILITY(3, Domain) +SHADER_VISIBILITY(4, Geometry) +SHADER_VISIBILITY(5, Pixel) +SHADER_VISIBILITY(6, Amplification) +SHADER_VISIBILITY(7, Mesh) +#undef SHADER_VISIBILITY +#endif // SHADER_VISIBILITY #ifdef DXIL_MODULE_FLAG diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index e1a9be5fc52d8..08b87c6ca97a9 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/BinaryFormat/DXContainer.h" #include #include @@ -14,6 +15,9 @@ namespace llvm { class raw_ostream; namespace mcdxbc { + + + struct RootSignatureHeader { uint32_t Version = 2; uint32_t NumParameters = 0; @@ -24,5 +28,23 @@ struct RootSignatureHeader { void write(raw_ostream &OS); }; + +struct RootConstants { + uint32_t ShaderRegister; + uint32_t RegisterSpace; + uint32_t Num32BitValues; + + void write(raw_ostream &OS); +}; + +struct RootParameter { + dxbc::RootParameterType ParameterType; + union { + RootConstants Constants; + }; + dxbc::ShaderVisibilityFlag ShaderVisibility; + + void write(raw_ostream &OS); +}; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 000d23f24d241..2b0ee368f7946 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" +#include "llvm/ADT/bit.h" #include "llvm/Support/EndianStream.h" using namespace llvm; using namespace llvm::mcdxbc; void RootSignatureHeader::write(raw_ostream &OS) { - support::endian::write(OS, Version, llvm::endianness::little); support::endian::write(OS, NumParameters, llvm::endianness::little); support::endian::write(OS, RootParametersOffset, llvm::endianness::little); @@ -21,3 +21,20 @@ void RootSignatureHeader::write(raw_ostream &OS) { support::endian::write(OS, StaticSamplersOffset, llvm::endianness::little); support::endian::write(OS, Flags, llvm::endianness::little); } + +void RootParameter::write(raw_ostream &OS) { + support::endian::write(OS, ParameterType, llvm::endianness::little); + support::endian::write(OS, ShaderVisibility, llvm::endianness::little); + + switch(ParameterType){ + case dxbc::RootParameterType::Constants32Bit: + Constants.write(OS); + break; + } +} + +void RootConstants::write(raw_ostream &OS) { + support::endian::write(OS, Num32BitValues, llvm::endianness::little); + support::endian::write(OS, RegisterSpace, llvm::endianness::little); + support::endian::write(OS, ShaderRegister, llvm::endianness::little); +} From 11256d871ac47e6cbc2a4fd1518aa576f75fdacb Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 22:35:05 +0000 Subject: [PATCH 187/220] refactoring root signature analysis to return a map instead --- .../lib/Target/DirectX/DXContainerGlobals.cpp | 15 +- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 161 ++++++++++-------- llvm/lib/Target/DirectX/DXILRootSignature.h | 28 ++- .../ContainerData/RootSignature-Error.ll | 3 +- 
.../RootSignature-Flags-Error.ll | 3 +- .../RootSignature-Flags-Validation-Error.ll | 3 +- ...tSignature-MultipleEntryFunctions-Error.ll | 27 --- .../RootSignature-MultipleEntryFunctions.ll | 18 +- .../RootSignature-RootElement-Error.ll | 3 +- 9 files changed, 136 insertions(+), 125 deletions(-) delete mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 029c62b3f5a96..09be5e3aff330 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -154,12 +154,23 @@ void DXContainerGlobals::addSignature(Module &M, void DXContainerGlobals::addRootSignature(Module &M, SmallVector &Globals) { + dxil::ModuleMetadataInfo &MMI = + getAnalysis().getModuleMetadata(); + + // Root Signature in Library shaders are different, + // since they don't use DXContainer to share it. + if (MMI.ShaderProfile == llvm::Triple::Library) + return; + + assert(MMI.EntryPropertyVec.size() == 1); + auto &RSA = getAnalysis(); + const Function *&EntryFunction = MMI.EntryPropertyVec[0].Entry; - if (!RSA.getResult()) + if (!RSA.hasForFunction(EntryFunction)) return; - const ModuleRootSignature &MRS = RSA.getResult().value(); + const ModuleRootSignature &MRS = RSA.getForFunction(EntryFunction); SmallString<256> Data; raw_svector_ostream OS(Data); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index afa7e327f6292..cf474a4ba23cb 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -26,7 +26,9 @@ #include "llvm/Pass.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include #include +#include using namespace llvm; using namespace llvm::dxil; @@ -37,20 +39,20 @@ static bool reportError(LLVMContext *Ctx, Twine Message, return true; } -static bool parseRootFlags(LLVMContext *Ctx, ModuleRootSignature *MRS, +static bool parseRootFlags(LLVMContext *Ctx, ModuleRootSignature &MRS, MDNode *RootFlagNode) { if (RootFlagNode->getNumOperands() != 2) return reportError(Ctx, "Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - MRS->Flags = Flag->getZExtValue(); + MRS.Flags = Flag->getZExtValue(); return false; } static bool parseRootSignatureElement(LLVMContext *Ctx, - ModuleRootSignature *MRS, + ModuleRootSignature &MRS, MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); if (ElementText == nullptr) @@ -73,8 +75,7 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, llvm_unreachable("Root signature element kind not expected."); } -static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, - const Function *EntryFunction) { +static bool parse(LLVMContext *Ctx, ModuleRootSignature &MRS, MDNode *Node) { bool HasError = false; /** Root Signature are specified as following in the metadata: @@ -89,15 +90,46 @@ static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, signature pair. */ - for (const MDNode *Node : Root->operands()) { - if (Node->getNumOperands() != 2) { - HasError = reportError( - Ctx, "Invalid format for Root Signature Definition. Pairs " - "of function, root signature expected."); + // Get the Root Signature Description from the function signature pair. + + // Loop through the Root Elements of the root signature. 
+ for (const auto &Operand : Node->operands()) { + MDNode *Element = dyn_cast(Operand); + if (Element == nullptr) + return reportError(Ctx, "Missing Root Element Metadata Node."); + + HasError = HasError || parseRootSignatureElement(Ctx, MRS, Element); + } + + return HasError; +} + +static bool validate(LLVMContext *Ctx, const ModuleRootSignature &MRS) { + if (!dxbc::RootSignatureValidations::isValidRootFlag(MRS.Flags)) { + return reportError(Ctx, "Invalid Root Signature flag value"); + } + return false; +} + +static SmallDenseMap +analyzeModule(Module &M) { + + LLVMContext *Ctx = &M.getContext(); + + SmallDenseMap MRSMap; + + NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); + if (RootSignatureNode == nullptr) + return MRSMap; + + for (const auto &RSDefNode : RootSignatureNode->operands()) { + if (RSDefNode->getNumOperands() != 2) { + reportError(Ctx, "Invalid format for Root Signature Definition. Pairs " + "of function, root signature expected."); continue; } - const MDOperand &FunctionPointerMdNode = Node->getOperand(0); + const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0); if (FunctionPointerMdNode == nullptr) { // Function was pruned during compilation. continue; @@ -106,97 +138,76 @@ static bool parse(LLVMContext *Ctx, ModuleRootSignature *MRS, NamedMDNode *Root, ValueAsMetadata *VAM = llvm::dyn_cast(FunctionPointerMdNode.get()); if (VAM == nullptr) { - HasError = - reportError(Ctx, "First element of root signature is not a value"); + reportError(Ctx, "First element of root signature is not a value"); continue; } Function *F = dyn_cast(VAM->getValue()); if (F == nullptr) { - HasError = - reportError(Ctx, "First element of root signature is not a function"); + reportError(Ctx, "First element of root signature is not a function"); continue; } - if (F != EntryFunction) - continue; + MDNode *RootElementListNode = + dyn_cast(RSDefNode->getOperand(1).get()); - // Get the Root Signature Description from the function signature pair. - MDNode *RS = dyn_cast(Node->getOperand(1).get()); - - if (RS == nullptr) { + if (RootElementListNode == nullptr) { reportError(Ctx, "Missing Root Element List Metadata node."); - continue; } - // Loop through the Root Elements of the root signature. 
- for (const auto &Operand : RS->operands()) { - MDNode *Element = dyn_cast(Operand); - if (Element == nullptr) - return reportError(Ctx, "Missing Root Element Metadata Node."); + ModuleRootSignature MRS; - HasError = HasError || parseRootSignatureElement(Ctx, MRS, Element); + if (parse(Ctx, MRS, RootElementListNode) || validate(Ctx, MRS)) { + return MRSMap; } - } - return HasError; -} -static bool validate(LLVMContext *Ctx, ModuleRootSignature *MRS) { - if (!dxbc::RootSignatureValidations::isValidRootFlag(MRS->Flags)) { - return reportError(Ctx, "Invalid Root Signature flag value"); + MRSMap.insert(std::make_pair(F, MRS)); } - return false; -} -static const Function *getEntryFunction(Module &M, ModuleMetadataInfo MMI) { - - LLVMContext *Ctx = &M.getContext(); - if (MMI.EntryPropertyVec.size() != 1) { - reportError(Ctx, "More than one entry function defined."); - // needed to stop compilation - report_fatal_error("Invalid Root Signature Definition", false); - return nullptr; - } - return MMI.EntryPropertyVec[0].Entry; + return MRSMap; } -std::optional -ModuleRootSignature::analyzeModule(Module &M, const Function *F) { - - LLVMContext *Ctx = &M.getContext(); +AnalysisKey RootSignatureAnalysis::Key; - ModuleRootSignature MRS; +SmallDenseMap +RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { + return analyzeModule(M); +} - NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); - if (RootSignatureNode == nullptr) - return std::nullopt; +//===----------------------------------------------------------------------===// - if (parse(Ctx, &MRS, RootSignatureNode, F) || validate(Ctx, &MRS)) { - // needed to stop compilation - report_fatal_error("Invalid Root Signature Definition", false); - return std::nullopt; +static void printSpaces(raw_ostream &Stream, unsigned int Count) { + for (unsigned int I = 0; I < Count; ++I) { + Stream << ' '; } - - return MRS; } -AnalysisKey RootSignatureAnalysis::Key; +PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, + ModuleAnalysisManager &AM) { + + SmallDenseMap &MRSMap = + AM.getResult(M); + OS << "Root Signature Definitions" + << "\n"; + uint8_t Space = 0; + for (const auto &P : MRSMap) { + const auto &[Function, MRS] = P; + OS << "Definition for '" << Function->getName() << "':\n"; + + // start root signature header + Space++; + printSpaces(OS, Space); + OS << "Flags: " << format_hex(MRS.Flags, 8) << ":\n"; + Space--; + // end root signature header + } -std::optional -RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { - ModuleMetadataInfo MMI = AM.getResult(M); - if (MMI.ShaderProfile == Triple::Library) - return std::nullopt; - return ModuleRootSignature::analyzeModule(M, getEntryFunction(M, MMI)); + return PreservedAnalyses::all(); } //===----------------------------------------------------------------------===// bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { - dxil::ModuleMetadataInfo &MMI = - getAnalysis().getModuleMetadata(); - if (MMI.ShaderProfile == Triple::Library) - return false; - MRS = ModuleRootSignature::analyzeModule(M, getEntryFunction(M, MMI)); + MRS = analyzeModule(M); return false; } @@ -208,8 +219,8 @@ void RootSignatureAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { char RootSignatureAnalysisWrapper::ID = 0; INITIALIZE_PASS_BEGIN(RootSignatureAnalysisWrapper, - "dx-root-signature-analysis", + "dxil-root-signature-analysis", "DXIL Root Signature Analysis", true, true) -INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) 
-INITIALIZE_PASS_END(RootSignatureAnalysisWrapper, "dx-root-signature-analysis", +INITIALIZE_PASS_END(RootSignatureAnalysisWrapper, + "dxil-root-signature-analysis", "DXIL Root Signature Analysis", true, true) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index d97c666f76e9e..992041696c557 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -11,6 +11,7 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Metadata.h" @@ -25,9 +26,8 @@ namespace dxil { enum class RootSignatureElementKind { None = 0, RootFlags = 1 }; struct ModuleRootSignature { + ModuleRootSignature() = default; uint32_t Flags = 0; - static std::optional analyzeModule(Module &M, - const Function *F); }; class RootSignatureAnalysis : public AnalysisInfoMixin { @@ -37,9 +37,10 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { public: RootSignatureAnalysis() = default; - using Result = std::optional; + using Result = SmallDenseMap; - std::optional run(Module &M, ModuleAnalysisManager &AM); + SmallDenseMap + run(Module &M, ModuleAnalysisManager &AM); }; /// Wrapper pass for the legacy pass manager. @@ -48,19 +49,34 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { /// passes which run through the legacy pass manager. class RootSignatureAnalysisWrapper : public ModulePass { private: - std::optional MRS; + SmallDenseMap MRS; public: static char ID; RootSignatureAnalysisWrapper() : ModulePass(ID) {} - const std::optional &getResult() const { return MRS; } + bool hasForFunction(const Function *F) { return MRS.find(F) != MRS.end(); } + + ModuleRootSignature getForFunction(const Function *F) { + assert(hasForFunction(F)); + return MRS[F]; + } bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override; }; +/// Printer pass for RootSignatureAnalysis results. +class RootSignatureAnalysisPrinter + : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit RootSignatureAnalysisPrinter(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + } // namespace dxil } // namespace llvm diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll index 0f0c7cc39d73b..44a012c92c9d7 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll @@ -1,8 +1,9 @@ -; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s +; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: Invalid format for Root Signature Definition. Pairs of function, root signature expected. 
+; CHECK-NO: Root Signature Definitions define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll index 31e9db6f36e5e..d839c9f825d1d 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll @@ -1,8 +1,9 @@ -; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s +; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: Invalid Root Element: NOTRootFlags +; CHECK-NO: Root Signature Definitions define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll index 43c9ecbd36570..0a2fb552af278 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll @@ -1,6 +1,7 @@ -; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s +; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s ; CHECK: error: Invalid Root Signature flag value +; CHECK-NO: Root Signature Definitions target triple = "dxil-unknown-shadermodel6.0-compute" diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll deleted file mode 100644 index d6e2030cb6fda..0000000000000 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions-Error.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s - -; CHECK: error: More than one entry function defined - -target triple = "dxil-unknown-shadermodel6.0-compute" - - -define void @main() #0 { -entry: - ret void -} - -define void @anotherMain() #1 { -entry: - ret void -} - -attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } -attributes #1 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } - -!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs -!2 = !{ ptr @main, !3 } ; function, root signature -!3 = !{ !4 } ; list of root signature elements -!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout -!5 = !{ ptr @anotherMain, !6 } ; function, root signature -!6 = !{ !7 } ; list of root signature elements -!7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index eb3c738a5fc03..f81927d9229f9 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -1,9 +1,9 @@ -; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s +; RUN: opt -passes='print' %s -S -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" -define void @main() { +define void @main() #0 { entry: ret void } @@ -24,12 +24,8 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout -; CHECK: - Name: RTS0 -; CHECK-NEXT: Size: 24 -; CHECK-NEXT: RootSignature: -; CHECK-NEXT: Version: 2 -; CHECK-NEXT: NumParameters: 0 -; 
CHECK-NEXT: RootParametersOffset: 0 -; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 -; CHECK-NEXT: DenyVertexShaderRootAccess: true +; CHECK: Root Signature Definitions +; CHECK-NEXT: Definition for 'main': +; CHECK-NEXT: Flags: 0x000001: +; CHECK-NEXT: Definition for 'anotherMain': +; CHECK-NEXT: Flags: 0x000002: diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll index 80f969e849d25..3680c162a4f5c 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll @@ -1,8 +1,9 @@ -; RUN: not llc %s --filetype=obj -o - 2>&1 | FileCheck %s +; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: Missing Root Element Metadata Node. +; CHECK-NO: Root Signature Definitions define void @main() #0 { From 3c5046e7847e2f2153b67c9c5f44550d783984fd Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 23:27:11 +0000 Subject: [PATCH 188/220] addressing pr comments --- llvm/lib/Object/DXContainer.cpp | 8 +++- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 38 +++++++++---------- llvm/lib/Target/DirectX/DXILRootSignature.h | 1 - .../RootSignature-Flags-Error.ll | 2 +- llvm/unittests/Object/DXContainerTest.cpp | 11 ++++-- 5 files changed, 32 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index eb825d6299026..30a8d5264e86e 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -7,10 +7,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/DXContainer.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; @@ -259,7 +261,8 @@ Error DirectX::RootSignature::parse(StringRef Data) { Current += sizeof(uint32_t); if (!dxbc::RootSignatureValidations::isValidVersion(VValue)) - return make_error("Invalid Root Signature Version"); + return validationFailed("unsupported root signature version read: " + + llvm::Twine(VValue)); Version = VValue; NumParameters = @@ -283,7 +286,8 @@ Error DirectX::RootSignature::parse(StringRef Data) { Current += sizeof(uint32_t); if (!dxbc::RootSignatureValidations::isValidRootFlag(FValue)) - return make_error("Invalid Root Signature flag"); + return validationFailed("unsupported root signature flag value read: " + + llvm::Twine(FValue)); Flags = FValue; return Error::success(); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index cf474a4ba23cb..ecca7abfe3e99 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -1,4 +1,4 @@ -//===- DXILRootSignature.cpp - DXIL Root Signature helper objects ----===// +//===- DXILRootSignature.cpp - DXIL Root Signature helper objects -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -68,8 +68,8 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, case RootSignatureElementKind::RootFlags: return parseRootFlags(Ctx, MRS, Element); case RootSignatureElementKind::None: - return reportError(Ctx, - "Invalid Root Element: " + ElementText->getString()); + return reportError(Ctx, "Invalid Root Signature Element: " + + ElementText->getString()); } llvm_unreachable("Root signature element kind not expected."); @@ -78,20 +78,6 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, static bool parse(LLVMContext *Ctx, ModuleRootSignature &MRS, MDNode *Node) { bool HasError = false; - /** Root Signature are specified as following in the metadata: - - !dx.rootsignatures = !{!2} ; list of function/root signature pairs - !2 = !{ ptr @main, !3 } ; function, root signature - !3 = !{ !4, !5, !6, !7 } ; list of root signature elements - - So for each MDNode inside dx.rootsignatures NamedMDNode - (the Root parameter of this function), the parsing process needs - to loop through each of its operands and process the function, - signature pair. - */ - - // Get the Root Signature Description from the function signature pair. - // Loop through the Root Elements of the root signature. for (const auto &Operand : Node->operands()) { MDNode *Element = dyn_cast(Operand); @@ -114,6 +100,18 @@ static bool validate(LLVMContext *Ctx, const ModuleRootSignature &MRS) { static SmallDenseMap analyzeModule(Module &M) { + /** Root Signature are specified as following in the metadata: + + !dx.rootsignatures = !{!2} ; list of function/root signature pairs + !2 = !{ ptr @main, !3 } ; function, root signature + !3 = !{ !4, !5, !6, !7 } ; list of root signature elements + + So for each MDNode inside dx.rootsignatures NamedMDNode + (the Root parameter of this function), the parsing process needs + to loop through each of its operands and process the function, + signature pair. + */ + LLVMContext *Ctx = &M.getContext(); SmallDenseMap MRSMap; @@ -129,22 +127,22 @@ analyzeModule(Module &M) { continue; } + // Function was pruned during compilation. const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0); if (FunctionPointerMdNode == nullptr) { - // Function was pruned during compilation. 
continue; } ValueAsMetadata *VAM = llvm::dyn_cast(FunctionPointerMdNode.get()); if (VAM == nullptr) { - reportError(Ctx, "First element of root signature is not a value"); + reportError(Ctx, "First element of root signature is not a Value"); continue; } Function *F = dyn_cast(VAM->getValue()); if (F == nullptr) { - reportError(Ctx, "First element of root signature is not a function"); + reportError(Ctx, "First element of root signature is not a Function"); continue; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 992041696c557..95970f2048767 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -18,7 +18,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" -#include namespace llvm { namespace dxil { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll index d839c9f825d1d..25599d4d345b9 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll @@ -2,7 +2,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" -; CHECK: error: Invalid Root Element: NOTRootFlags +; CHECK: error: Invalid Root Signature Element: NOTRootFlags ; CHECK-NO: Root Signature Definitions diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index bafde7334fbbc..943022bb4469b 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -870,8 +870,9 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), - FailedWithMessage("Invalid Root Signature Version")); + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<100>(Buffer)), + FailedWithMessage("unsupported root signature version read: 3")); } { // Flag has been set to an invalid value @@ -883,7 +884,9 @@ TEST(RootSignature, ParseRootFlags) { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, }; - EXPECT_THAT_EXPECTED(DXContainer::create(getMemoryBuffer<68>(Buffer)), - FailedWithMessage("Invalid Root Signature flag")); + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<100>(Buffer)), + FailedWithMessage( + "unsupported root signature flag value read: 4278190081")); } } From 5d4505c5ad46d316acf0c219b707f7b0704ad975 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 12 Feb 2025 23:31:59 +0000 Subject: [PATCH 189/220] clean up --- llvm/lib/Object/DXContainer.cpp | 2 -- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 1 - 2 files changed, 3 deletions(-) diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 30a8d5264e86e..1eb1453c65147 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -7,12 +7,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/DXContainer.h" -#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; diff --git 
a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 09be5e3aff330..347024b5d8779 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -29,7 +29,6 @@ #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include using namespace llvm; using namespace llvm::dxil; From 3117530c82f191fca265c9addac39e5c8a57cc2b Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 13 Feb 2025 06:47:39 +0000 Subject: [PATCH 190/220] addressing pr comments --- .../llvm/MC/DXContainerRootSignature.h | 7 +- .../include/llvm/ObjectYAML/DXContainerYAML.h | 12 ++-- llvm/lib/MC/DXContainerRootSignature.cpp | 3 +- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 18 ++--- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 8 +-- llvm/lib/Target/DirectX/CMakeLists.txt | 1 + .../lib/Target/DirectX/DXContainerGlobals.cpp | 16 ++--- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 67 ++++++++++--------- llvm/lib/Target/DirectX/DXILRootSignature.h | 26 +++---- .../ContainerData/RootSignature-Error.ll | 2 +- .../RootSignature-Flags-Error.ll | 2 +- .../RootSignature-Flags-Validation-Error.ll | 2 +- .../RootSignature-MultipleEntryFunctions.ll | 20 ++++-- .../RootSignature-RootElement-Error.ll | 2 +- llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 2 +- 15 files changed, 97 insertions(+), 91 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 08b87c6ca97a9..f7d56e115016a 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -15,10 +15,7 @@ namespace llvm { class raw_ostream; namespace mcdxbc { - - - -struct RootSignatureHeader { +struct RootSignatureDesc { uint32_t Version = 2; uint32_t NumParameters = 0; uint32_t RootParametersOffset = 0; @@ -26,7 +23,7 @@ struct RootSignatureHeader { uint32_t StaticSamplersOffset = 0; uint32_t Flags = 0; - void write(raw_ostream &OS); + void write(raw_ostream &OS) const; }; struct RootConstants { diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 0200f5cb196ff..ecad35e82b155 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -74,9 +74,9 @@ struct ShaderHash { }; #define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; -struct RootSignatureDesc { - RootSignatureDesc() = default; - RootSignatureDesc(const object::DirectX::RootSignature &Data); +struct RootSignatureYamlDesc { + RootSignatureYamlDesc() = default; + RootSignatureYamlDesc(const object::DirectX::RootSignature &Data); uint32_t Version; uint32_t NumParameters; @@ -176,7 +176,7 @@ struct Part { std::optional Hash; std::optional Info; std::optional Signature; - std::optional RootSignature; + std::optional RootSignature; }; struct Object { @@ -259,9 +259,9 @@ template <> struct MappingTraits { static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El); }; -template <> struct MappingTraits { +template <> struct MappingTraits { static void mapping(IO &IO, - DXContainerYAML::RootSignatureDesc &RootSignature); + DXContainerYAML::RootSignatureYamlDesc &RootSignature); }; } // namespace yaml diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 2b0ee368f7946..27a19962b5ef2 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ 
-13,7 +13,8 @@ using namespace llvm; using namespace llvm::mcdxbc; -void RootSignatureHeader::write(raw_ostream &OS) { +void RootSignatureDesc::write(raw_ostream &OS) const { + support::endian::write(OS, Version, llvm::endianness::little); support::endian::write(OS, NumParameters, llvm::endianness::little); support::endian::write(OS, RootParametersOffset, llvm::endianness::little); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index b7d1c6558fa1f..f6ed09c857bb7 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -266,15 +266,15 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { if (!P.RootSignature.has_value()) continue; - mcdxbc::RootSignatureHeader Header; - Header.Flags = P.RootSignature->getEncodedFlags(); - Header.Version = P.RootSignature->Version; - Header.NumParameters = P.RootSignature->NumParameters; - Header.RootParametersOffset = P.RootSignature->RootParametersOffset; - Header.NumStaticSamplers = P.RootSignature->NumStaticSamplers; - Header.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; - - Header.write(OS); + mcdxbc::RootSignatureDesc RS; + RS.Flags = P.RootSignature->getEncodedFlags(); + RS.Version = P.RootSignature->Version; + RS.NumParameters = P.RootSignature->NumParameters; + RS.RootParametersOffset = P.RootSignature->RootParametersOffset; + RS.NumStaticSamplers = P.RootSignature->NumStaticSamplers; + RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; + + RS.write(OS); break; } uint64_t BytesWritten = OS.tell() - DataStart; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 0869fd4fa9785..f03c7da65999d 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -29,7 +29,7 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { #include "llvm/BinaryFormat/DXContainerConstants.def" } -DXContainerYAML::RootSignatureDesc::RootSignatureDesc( +DXContainerYAML::RootSignatureYamlDesc::RootSignatureYamlDesc( const object::DirectX::RootSignature &Data) : Version(Data.getVersion()), NumParameters(Data.getNumParameters()), RootParametersOffset(Data.getRootParametersOffset()), @@ -41,7 +41,7 @@ DXContainerYAML::RootSignatureDesc::RootSignatureDesc( #include "llvm/BinaryFormat/DXContainerConstants.def" } -uint32_t DXContainerYAML::RootSignatureDesc::getEncodedFlags() { +uint32_t DXContainerYAML::RootSignatureYamlDesc::getEncodedFlags() { uint64_t Flag = 0; #define ROOT_ELEMENT_FLAG(Num, Val) \ if (Val) \ @@ -209,8 +209,8 @@ void MappingTraits::mapping( IO.mapRequired("Parameters", S.Parameters); } -void MappingTraits::mapping( - IO &IO, DXContainerYAML::RootSignatureDesc &S) { +void MappingTraits::mapping( + IO &IO, DXContainerYAML::RootSignatureYamlDesc &S) { IO.mapRequired("Version", S.Version); IO.mapRequired("NumParameters", S.NumParameters); IO.mapRequired("RootParametersOffset", S.RootParametersOffset); diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt index 89fe494dea71c..5a167535b0afa 100644 --- a/llvm/lib/Target/DirectX/CMakeLists.txt +++ b/llvm/lib/Target/DirectX/CMakeLists.txt @@ -34,6 +34,7 @@ add_llvm_target(DirectXCodeGen DXILShaderFlags.cpp DXILTranslateMetadata.cpp DXILRootSignature.cpp + LINK_COMPONENTS Analysis AsmPrinter diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 347024b5d8779..26e2bbaa6c894 
100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -24,11 +24,11 @@ #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/MC/DXContainerPSVInfo.h" -#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include using namespace llvm; using namespace llvm::dxil; @@ -156,26 +156,24 @@ void DXContainerGlobals::addRootSignature(Module &M, dxil::ModuleMetadataInfo &MMI = getAnalysis().getModuleMetadata(); - // Root Signature in Library shaders are different, - // since they don't use DXContainer to share it. + // Root Signature in Library don't compile to DXContainer. if (MMI.ShaderProfile == llvm::Triple::Library) return; assert(MMI.EntryPropertyVec.size() == 1); auto &RSA = getAnalysis(); - const Function *&EntryFunction = MMI.EntryPropertyVec[0].Entry; + const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; + const std::optional &MaybeRS = + RSA.getForFunction(EntryFunction); - if (!RSA.hasForFunction(EntryFunction)) + if (!MaybeRS.has_value()) return; - const ModuleRootSignature &MRS = RSA.getForFunction(EntryFunction); + const RootSignatureDesc &RSH = MaybeRS.value(); SmallString<256> Data; raw_svector_ostream OS(Data); - RootSignatureHeader RSH; - RSH.Flags = MRS.Flags; - RSH.write(OS); Constant *Constant = diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index ecca7abfe3e99..618eee1113572 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -26,6 +26,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -39,20 +40,20 @@ static bool reportError(LLVMContext *Ctx, Twine Message, return true; } -static bool parseRootFlags(LLVMContext *Ctx, ModuleRootSignature &MRS, +static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, MDNode *RootFlagNode) { if (RootFlagNode->getNumOperands() != 2) return reportError(Ctx, "Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - MRS.Flags = Flag->getZExtValue(); + RSD.Flags = Flag->getZExtValue(); return false; } static bool parseRootSignatureElement(LLVMContext *Ctx, - ModuleRootSignature &MRS, + mcdxbc::RootSignatureDesc &RSD, MDNode *Element) { MDString *ElementText = cast(Element->getOperand(0)); if (ElementText == nullptr) @@ -61,21 +62,22 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, RootSignatureElementKind ElementKind = StringSwitch(ElementText->getString()) .Case("RootFlags", RootSignatureElementKind::RootFlags) - .Default(RootSignatureElementKind::None); + .Default(RootSignatureElementKind::Error); switch (ElementKind) { case RootSignatureElementKind::RootFlags: - return parseRootFlags(Ctx, MRS, Element); - case RootSignatureElementKind::None: + return parseRootFlags(Ctx, RSD, Element); + case RootSignatureElementKind::Error: return reportError(Ctx, "Invalid Root Signature Element: " + ElementText->getString()); } - llvm_unreachable("Root signature element kind not expected."); + llvm_unreachable("Unhandled RootSignatureElementKind enum."); } -static bool parse(LLVMContext *Ctx, ModuleRootSignature &MRS, MDNode *Node) { +static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, + MDNode *Node) { bool 
HasError = false; // Loop through the Root Elements of the root signature. @@ -84,20 +86,20 @@ static bool parse(LLVMContext *Ctx, ModuleRootSignature &MRS, MDNode *Node) { if (Element == nullptr) return reportError(Ctx, "Missing Root Element Metadata Node."); - HasError = HasError || parseRootSignatureElement(Ctx, MRS, Element); + HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element); } return HasError; } -static bool validate(LLVMContext *Ctx, const ModuleRootSignature &MRS) { - if (!dxbc::RootSignatureValidations::isValidRootFlag(MRS.Flags)) { +static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) { + if (!dxbc::RootSignatureValidations::isValidRootFlag(RSD.Flags)) { return reportError(Ctx, "Invalid Root Signature flag value"); } return false; } -static SmallDenseMap +static SmallDenseMap analyzeModule(Module &M) { /** Root Signature are specified as following in the metadata: @@ -114,11 +116,11 @@ analyzeModule(Module &M) { LLVMContext *Ctx = &M.getContext(); - SmallDenseMap MRSMap; + SmallDenseMap RSDMap; NamedMDNode *RootSignatureNode = M.getNamedMetadata("dx.rootsignatures"); if (RootSignatureNode == nullptr) - return MRSMap; + return RSDMap; for (const auto &RSDefNode : RootSignatureNode->operands()) { if (RSDefNode->getNumOperands() != 2) { @@ -153,49 +155,50 @@ analyzeModule(Module &M) { reportError(Ctx, "Missing Root Element List Metadata node."); } - ModuleRootSignature MRS; + mcdxbc::RootSignatureDesc RSD; - if (parse(Ctx, MRS, RootElementListNode) || validate(Ctx, MRS)) { - return MRSMap; + if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) { + return RSDMap; } - MRSMap.insert(std::make_pair(F, MRS)); + RSDMap.insert(std::make_pair(F, RSD)); } - return MRSMap; + return RSDMap; } AnalysisKey RootSignatureAnalysis::Key; -SmallDenseMap +SmallDenseMap RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { return analyzeModule(M); } //===----------------------------------------------------------------------===// -static void printSpaces(raw_ostream &Stream, unsigned int Count) { - for (unsigned int I = 0; I < Count; ++I) { - Stream << ' '; - } -} - PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, ModuleAnalysisManager &AM) { - SmallDenseMap &MRSMap = + SmallDenseMap &RSDMap = AM.getResult(M); OS << "Root Signature Definitions" << "\n"; uint8_t Space = 0; - for (const auto &P : MRSMap) { - const auto &[Function, MRS] = P; + for (const auto &P : RSDMap) { + const auto &[Function, RSD] = P; OS << "Definition for '" << Function->getName() << "':\n"; // start root signature header Space++; - printSpaces(OS, Space); - OS << "Flags: " << format_hex(MRS.Flags, 8) << ":\n"; + OS << indent(Space) << "Flags: " << format_hex(RSD.Flags, 8) << ":\n"; + OS << indent(Space) << "Version: " << RSD.Version << ":\n"; + OS << indent(Space) << "NumParameters: " << RSD.NumParameters << ":\n"; + OS << indent(Space) << "RootParametersOffset: " << RSD.RootParametersOffset + << ":\n"; + OS << indent(Space) << "NumStaticSamplers: " << RSD.NumStaticSamplers + << ":\n"; + OS << indent(Space) << "StaticSamplersOffset: " << RSD.StaticSamplersOffset + << ":\n"; Space--; // end root signature header } @@ -205,7 +208,7 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, //===----------------------------------------------------------------------===// bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { - MRS = analyzeModule(M); + FuncToRsMap = analyzeModule(M); return false; } diff --git 
a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 95970f2048767..7697703ff31d1 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -17,18 +17,14 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" +#include namespace llvm { namespace dxil { -enum class RootSignatureElementKind { None = 0, RootFlags = 1 }; - -struct ModuleRootSignature { - ModuleRootSignature() = default; - uint32_t Flags = 0; -}; - +enum class RootSignatureElementKind { Error = 0, RootFlags = 1 }; class RootSignatureAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; static AnalysisKey Key; @@ -36,9 +32,9 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { public: RootSignatureAnalysis() = default; - using Result = SmallDenseMap; + using Result = SmallDenseMap; - SmallDenseMap + SmallDenseMap run(Module &M, ModuleAnalysisManager &AM); }; @@ -48,18 +44,18 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { /// passes which run through the legacy pass manager. class RootSignatureAnalysisWrapper : public ModulePass { private: - SmallDenseMap MRS; + SmallDenseMap FuncToRsMap; public: static char ID; RootSignatureAnalysisWrapper() : ModulePass(ID) {} - bool hasForFunction(const Function *F) { return MRS.find(F) != MRS.end(); } - - ModuleRootSignature getForFunction(const Function *F) { - assert(hasForFunction(F)); - return MRS[F]; + std::optional getForFunction(const Function *F) { + auto Lookup = FuncToRsMap.find(F); + if (Lookup == FuncToRsMap.end()) + return std::nullopt; + return Lookup->second; } bool runOnModule(Module &M) override; diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll index 44a012c92c9d7..2a2188b1a13bb 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll @@ -3,7 +3,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: Invalid format for Root Signature Definition. Pairs of function, root signature expected. 
-; CHECK-NO: Root Signature Definitions +; CHECK-NOT: Root Signature Definitions define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll index 25599d4d345b9..4921472d253ad 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll @@ -3,7 +3,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: Invalid Root Signature Element: NOTRootFlags -; CHECK-NO: Root Signature Definitions +; CHECK-NOT: Root Signature Definitions define void @main() #0 { diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll index 0a2fb552af278..fe93c9993c1c3 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Validation-Error.ll @@ -1,7 +1,7 @@ ; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s ; CHECK: error: Invalid Root Signature flag value -; CHECK-NO: Root Signature Definitions +; CHECK-NOT: Root Signature Definitions target triple = "dxil-unknown-shadermodel6.0-compute" diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index f81927d9229f9..652f8092b7a69 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -24,8 +24,18 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout -; CHECK: Root Signature Definitions -; CHECK-NEXT: Definition for 'main': -; CHECK-NEXT: Flags: 0x000001: -; CHECK-NEXT: Definition for 'anotherMain': -; CHECK-NEXT: Flags: 0x000002: +; CHECK-LABEL: Definition for 'main': +; CHECK-NEXT: Flags: 0x000001 +; CHECK-NEXT: Version: 2 +; CHECK-NEXT: NumParameters: 0 +; CHECK-NEXT: RootParametersOffset: 0 +; CHECK-NEXT: NumStaticSamplers: 0 +; CHECK-NEXT: StaticSamplersOffset: 0 + +; CHECK-LABEL: Definition for 'anotherMain': +; CHECK-NEXT: Flags: 0x000002 +; CHECK-NEXT: Version: 2 +; CHECK-NEXT: NumParameters: 0 +; CHECK-NEXT: RootParametersOffset: 0 +; CHECK-NEXT: NumStaticSamplers: 0 +; CHECK-NEXT: StaticSamplersOffset: 0 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll index 3680c162a4f5c..89e23f6540c5f 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll @@ -3,7 +3,7 @@ target triple = "dxil-unknown-shadermodel6.0-compute" ; CHECK: error: Missing Root Element Metadata Node. 
-; CHECK-NO: Root Signature Definitions +; CHECK-NOT: Root Signature Definitions define void @main() #0 { diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index 54a912d9438af..f3ef1b6a27bcf 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -156,7 +156,7 @@ dumpDXContainer(MemoryBufferRef Source) { case dxbc::PartType::RTS0: std::optional RS = Container.getRootSignature(); if (RS.has_value()) - NewPart.RootSignature = DXContainerYAML::RootSignatureDesc(*RS); + NewPart.RootSignature = DXContainerYAML::RootSignatureYamlDesc(*RS); break; } } From 0af845c40bb5f205285527482053fbaebefe34de Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 13 Feb 2025 18:09:23 +0000 Subject: [PATCH 191/220] implementing find interface for RootSignatureAnalysisWrapper --- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 9 ++++----- llvm/lib/Target/DirectX/DXILRootSignature.h | 12 ++++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 26e2bbaa6c894..268565b867091 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -164,17 +164,16 @@ void DXContainerGlobals::addRootSignature(Module &M, auto &RSA = getAnalysis(); const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - const std::optional &MaybeRS = - RSA.getForFunction(EntryFunction); + const auto &FuncRs = RSA.find(EntryFunction); - if (!MaybeRS.has_value()) + if (FuncRs == RSA.end()) return; - const RootSignatureDesc &RSH = MaybeRS.value(); + const RootSignatureDesc &RS = FuncRs->getSecond(); SmallString<256> Data; raw_svector_ostream OS(Data); - RSH.write(OS); + RS.write(OS); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 7697703ff31d1..8c25b2eb3fadf 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -51,12 +51,12 @@ class RootSignatureAnalysisWrapper : public ModulePass { RootSignatureAnalysisWrapper() : ModulePass(ID) {} - std::optional getForFunction(const Function *F) { - auto Lookup = FuncToRsMap.find(F); - if (Lookup == FuncToRsMap.end()) - return std::nullopt; - return Lookup->second; - } + using iterator = + SmallDenseMap::iterator; + + iterator find(const Function *F) { return FuncToRsMap.find(F); } + + iterator end() { return FuncToRsMap.end(); } bool runOnModule(Module &M) override; From bf49a3adeef6b980abcb21ce32ea098813593c17 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 13 Feb 2025 18:46:55 +0000 Subject: [PATCH 192/220] adding test for null function --- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 2 +- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 268565b867091..5508af40663b1 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -169,7 +169,7 @@ void DXContainerGlobals::addRootSignature(Module &M, if (FuncRs == RSA.end()) return; - const RootSignatureDesc &RS = FuncRs->getSecond(); + const RootSignatureDesc &RS = FuncRs->second; SmallString<256> Data; raw_svector_ostream OS(Data); diff --git 
a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 618eee1113572..49fc892eade5d 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -132,6 +132,8 @@ analyzeModule(Module &M) { // Function was pruned during compilation. const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0); if (FunctionPointerMdNode == nullptr) { + reportError( + Ctx, "Function associated with Root Signature definition is null."); continue; } From 78826a5bfb7ef1d341335167af202f8400e75066 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 14 Feb 2025 20:55:00 +0000 Subject: [PATCH 193/220] fix root signature test error --- .../RootSignature-MultipleEntryFunctions.ll | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index 652f8092b7a69..0547b0bae7a7e 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -23,17 +23,16 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !6 = !{ !7 } ; list of root signature elements !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout - -; CHECK-LABEL: Definition for 'main': -; CHECK-NEXT: Flags: 0x000001 +; CHECK-LABEL: Definition for 'anotherMain': +; CHECK-NEXT: Flags: 0x000002 ; CHECK-NEXT: Version: 2 ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 ; CHECK-NEXT: NumStaticSamplers: 0 ; CHECK-NEXT: StaticSamplersOffset: 0 -; CHECK-LABEL: Definition for 'anotherMain': -; CHECK-NEXT: Flags: 0x000002 +; CHECK-LABEL: Definition for 'main': +; CHECK-NEXT: Flags: 0x000001 ; CHECK-NEXT: Version: 2 ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 From 4e689e912911e39980548605b3d07cc79fb5e670 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 14 Feb 2025 21:35:38 +0000 Subject: [PATCH 194/220] fix other functions are checked --- .../ContainerData/RootSignature-MultipleEntryFunctions.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index 0547b0bae7a7e..7adb17d0b022f 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -23,16 +23,16 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !6 = !{ !7 } ; list of root signature elements !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout -; CHECK-LABEL: Definition for 'anotherMain': -; CHECK-NEXT: Flags: 0x000002 +; CHECK-LABEL: Definition for 'main': +; CHECK-NEXT: Flags: 0x000001 ; CHECK-NEXT: Version: 2 ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 ; CHECK-NEXT: NumStaticSamplers: 0 ; CHECK-NEXT: StaticSamplersOffset: 0 -; CHECK-LABEL: Definition for 'main': -; CHECK-NEXT: Flags: 0x000001 +; CHECK-LABEL: Definition for 'anotherMain': +; CHECK-NEXT: Flags: 0x000002 ; CHECK-NEXT: Version: 2 ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 From b0d0180d74cd073c1d2ba9f612989f54f78ad70c Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 14 Feb 2025 
22:48:24 +0000 Subject: [PATCH 195/220] adding missing continue --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 49fc892eade5d..b935ef0c1611d 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -155,6 +155,7 @@ analyzeModule(Module &M) { if (RootElementListNode == nullptr) { reportError(Ctx, "Missing Root Element List Metadata node."); + continue; } mcdxbc::RootSignatureDesc RSD; From 3c6894f76d1d4a52d4eaede5f290f190c604c323 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Fri, 14 Feb 2025 23:52:13 +0000 Subject: [PATCH 196/220] adding few more tests --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index b935ef0c1611d..83989161ac121 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -150,11 +150,16 @@ analyzeModule(Module &M) { continue; } - MDNode *RootElementListNode = - dyn_cast(RSDefNode->getOperand(1).get()); + Metadata *RootElementListOperand = RSDefNode->getOperand(1).get(); + if (RootElementListOperand == nullptr) { + reportError(Ctx, "Root Element mdnode is null."); + continue; + } + + MDNode *RootElementListNode = dyn_cast(RootElementListOperand); if (RootElementListNode == nullptr) { - reportError(Ctx, "Missing Root Element List Metadata node."); + reportError(Ctx, "Root Element is not a metadata node."); continue; } From 08f6ddc579c7510a37d15127cd8eecf66fb5b221 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Sat, 15 Feb 2025 01:22:18 +0000 Subject: [PATCH 197/220] adding yaml2obj support --- llvm/include/llvm/BinaryFormat/DXContainer.h | 24 ++++++- .../llvm/MC/DXContainerRootSignature.h | 9 +-- .../include/llvm/ObjectYAML/DXContainerYAML.h | 18 ++++- llvm/lib/BinaryFormat/DXContainer.cpp | 20 ++++++ llvm/lib/MC/DXContainerRootSignature.cpp | 70 +++++++++++++------ llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 10 ++- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 39 +++++++++-- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 16 ++--- 8 files changed, 158 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 0f495f5935995..bd9c8385b5002 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -160,14 +160,14 @@ enum class RootElementFlag : uint32_t { }; #define ROOT_PARAMETER(Val, Enum) Enum = Val, -enum class RootParameterType : uint8_t { +enum class RootParameterType: uint32_t { #include "DXContainerConstants.def" }; ArrayRef> getRootParameterTypes(); #define SHADER_VISIBILITY(Val, Enum) Enum = Val, -enum class ShaderVisibilityFlag : uint8_t { +enum class ShaderVisibilityFlag: uint32_t { #include "DXContainerConstants.def" }; @@ -562,6 +562,26 @@ struct ProgramSignatureElement { static_assert(sizeof(ProgramSignatureElement) == 32, "ProgramSignatureElement is misaligned"); +struct RootConstants { + uint32_t ShaderRegister; + uint32_t RegisterSpace; + uint32_t Num32BitValues; +}; + +struct RootParameter { +dxbc::RootParameterType ParameterType; +union { + RootConstants Constants; +}; +dxbc::ShaderVisibilityFlag ShaderVisibility; +}; + +struct RootSignatureHeader { + uint32_t Version = 2; + uint32_t Flags 
= 0; +}; + + struct RootSignatureValidations { static bool isValidRootFlag(uint32_t Flags) { return (Flags & ~0xfff) == 0; } diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index f7d56e115016a..6909ae913816e 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -15,13 +15,10 @@ namespace llvm { class raw_ostream; namespace mcdxbc { + struct RootSignatureDesc { - uint32_t Version = 2; - uint32_t NumParameters = 0; - uint32_t RootParametersOffset = 0; - uint32_t NumStaticSamplers = 0; - uint32_t StaticSamplersOffset = 0; - uint32_t Flags = 0; + dxbc::RootSignatureHeader Header; + SmallVector Parameters; void write(raw_ostream &OS) const; }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index ecad35e82b155..1eb365899dc7f 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -79,11 +79,12 @@ struct RootSignatureYamlDesc { RootSignatureYamlDesc(const object::DirectX::RootSignature &Data); uint32_t Version; - uint32_t NumParameters; - uint32_t RootParametersOffset; uint32_t NumStaticSamplers; uint32_t StaticSamplersOffset; + SmallVector Parameters; + + uint32_t getEncodedFlags(); #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -192,6 +193,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::ResourceBindInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureElement) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::PSVInfo::MaskVector) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DXContainerYAML::SignatureParameter) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::dxbc::RootParameter) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::SemanticKind) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode) @@ -200,6 +202,8 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceKind) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::RootParameterType) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::ShaderVisibilityFlag) namespace llvm { @@ -264,6 +268,16 @@ template <> struct MappingTraits { DXContainerYAML::RootSignatureYamlDesc &RootSignature); }; +template <> struct MappingTraits { + static void mapping(IO &IO, + dxbc::RootParameter &P); +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, + dxbc::RootConstants &C); +}; + } // namespace yaml } // namespace llvm diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 97ceb16ccf53f..d1a1f1f544137 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,6 +60,26 @@ ArrayRef> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } +#define SHADER_VISIBILITY(Val, Enum) {#Enum, ShaderVisibilityFlag::Enum}, + +static const EnumEntry ShaderVisibilityFlags[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> dxbc::getShaderVisibilityFlags() { + return ArrayRef(ShaderVisibilityFlags); +} + +#define ROOT_PARAMETER(Val, Enum) {#Enum, RootParameterType::Enum}, + +static const EnumEntry RootParameterTypes[] = { +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> dxbc::getRootParameterTypes() { + return 
ArrayRef(RootParameterTypes); +} + #define SEMANTIC_KIND(Val, Enum) {#Enum, PSV::SemanticKind::Enum}, static const EnumEntry SemanticKindNames[] = { diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 27a19962b5ef2..04ed6cd73186d 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -8,34 +8,64 @@ #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/ADT/bit.h" +#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/EndianStream.h" +#include +#include using namespace llvm; using namespace llvm::mcdxbc; +template +static uint32_t getSizeOf () { + return static_cast(sizeof(T)); +} + + + void RootSignatureDesc::write(raw_ostream &OS) const { + uint32_t Offset = 16; + const uint32_t ParametersOffset = getSizeOf() + Offset; + const uint32_t ParameterByteSize = Parameters.size_in_bytes(); - support::endian::write(OS, Version, llvm::endianness::little); - support::endian::write(OS, NumParameters, llvm::endianness::little); - support::endian::write(OS, RootParametersOffset, llvm::endianness::little); - support::endian::write(OS, NumStaticSamplers, llvm::endianness::little); - support::endian::write(OS, StaticSamplersOffset, llvm::endianness::little); - support::endian::write(OS, Flags, llvm::endianness::little); -} -void RootParameter::write(raw_ostream &OS) { - support::endian::write(OS, ParameterType, llvm::endianness::little); - support::endian::write(OS, ShaderVisibility, llvm::endianness::little); + // Writing header information + support::endian::write(OS, Header.Version, llvm::endianness::little); + Offset += getSizeOf(); - switch(ParameterType){ - case dxbc::RootParameterType::Constants32Bit: - Constants.write(OS); - break; - } -} + support::endian::write(OS, (uint32_t)Parameters.size(), llvm::endianness::little); + Offset += getSizeOf(); -void RootConstants::write(raw_ostream &OS) { - support::endian::write(OS, Num32BitValues, llvm::endianness::little); - support::endian::write(OS, RegisterSpace, llvm::endianness::little); - support::endian::write(OS, ShaderRegister, llvm::endianness::little); + support::endian::write(OS, ParametersOffset, llvm::endianness::little); + Offset += getSizeOf(); + + support::endian::write(OS, ((uint32_t)0), llvm::endianness::little); + Offset += getSizeOf(); + + support::endian::write(OS, ParameterByteSize + ParametersOffset, llvm::endianness::little); + Offset += getSizeOf(); + + support::endian::write(OS, Header.Flags, llvm::endianness::little); + + for (const dxbc::RootParameter &P : Parameters){ + support::endian::write(OS, P.ParameterType, llvm::endianness::little); + support::endian::write(OS, P.ShaderVisibility, llvm::endianness::little); + support::endian::write(OS, Offset, llvm::endianness::little); + Offset += getSizeOf(); + + + switch(P.ParameterType){ + case dxbc::RootParameterType::Constants32Bit:{ + support::endian::write(OS, P.Constants.ShaderRegister, llvm::endianness::little); + Offset += getSizeOf(); + + support::endian::write(OS, P.Constants.RegisterSpace, llvm::endianness::little); + Offset += getSizeOf(); + + support::endian::write(OS, P.Constants.Num32BitValues, llvm::endianness::little); + Offset += getSizeOf(); + + } break; + } + } } diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index f6ed09c857bb7..eef2f0e8c366e 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/Errc.h" 
#include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; @@ -267,12 +268,9 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { continue; mcdxbc::RootSignatureDesc RS; - RS.Flags = P.RootSignature->getEncodedFlags(); - RS.Version = P.RootSignature->Version; - RS.NumParameters = P.RootSignature->NumParameters; - RS.RootParametersOffset = P.RootSignature->RootParametersOffset; - RS.NumStaticSamplers = P.RootSignature->NumStaticSamplers; - RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; + RS.Header.Flags = P.RootSignature->getEncodedFlags(); + RS.Header.Version = P.RootSignature->Version; + RS.Parameters = std::move(P.RootSignature->Parameters); RS.write(OS); break; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index f03c7da65999d..39e6241d51aab 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -31,8 +31,7 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { DXContainerYAML::RootSignatureYamlDesc::RootSignatureYamlDesc( const object::DirectX::RootSignature &Data) - : Version(Data.getVersion()), NumParameters(Data.getNumParameters()), - RootParametersOffset(Data.getRootParametersOffset()), + : Version(Data.getVersion()), NumStaticSamplers(Data.getNumStaticSamplers()), StaticSamplersOffset(Data.getStaticSamplersOffset()) { uint32_t Flags = Data.getFlags(); @@ -212,14 +211,34 @@ void MappingTraits::mapping( void MappingTraits::mapping( IO &IO, DXContainerYAML::RootSignatureYamlDesc &S) { IO.mapRequired("Version", S.Version); - IO.mapRequired("NumParameters", S.NumParameters); - IO.mapRequired("RootParametersOffset", S.RootParametersOffset); IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); + IO.mapRequired("Parameters", S.Parameters); #define ROOT_ELEMENT_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false); #include "llvm/BinaryFormat/DXContainerConstants.def" } +void MappingTraits::mapping( + IO &IO, dxbc::RootConstants &C) { + IO.mapRequired("Num32BitValues", C.Num32BitValues); + IO.mapRequired("RegisterSpace", C.RegisterSpace); + IO.mapRequired("ShaderRegister", C.ShaderRegister); + + } + +void MappingTraits::mapping( + IO &IO, dxbc::RootParameter &P) { + IO.mapRequired("ParameterType", P.ParameterType); + IO.mapRequired("ShaderVisibility", P.ShaderVisibility); + switch (P.ParameterType) { + + case dxbc::RootParameterType::Constants32Bit: + IO.mapRequired("Constants", P.Constants); + + break; + } + } + void MappingTraits::mapping(IO &IO, DXContainerYAML::Part &P) { IO.mapRequired("Name", P.Name); @@ -323,6 +342,18 @@ void ScalarEnumerationTraits::enumeration( IO.enumCase(Value, E.Name.str().c_str(), E.Value); } +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::RootParameterType &Value) { +for (const auto &E : dxbc::getRootParameterTypes()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::ShaderVisibilityFlag &Value) { +for (const auto &E : dxbc::getShaderVisibilityFlags()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); +} + } // namespace yaml void DXContainerYAML::PSVInfo::mapInfoForVersion(yaml::IO &IO) { diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 83989161ac121..104cd30d1fa75 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ 
b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -47,7 +47,7 @@ static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, return reportError(Ctx, "Invalid format for RootFlag Element"); auto *Flag = mdconst::extract(RootFlagNode->getOperand(1)); - RSD.Flags = Flag->getZExtValue(); + RSD.Header.Flags = Flag->getZExtValue(); return false; } @@ -93,7 +93,7 @@ static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, } static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) { - if (!dxbc::RootSignatureValidations::isValidRootFlag(RSD.Flags)) { + if (!dxbc::RootSignatureValidations::isValidRootFlag(RSD.Header.Flags)) { return reportError(Ctx, "Invalid Root Signature flag value"); } return false; @@ -198,14 +198,14 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, // start root signature header Space++; - OS << indent(Space) << "Flags: " << format_hex(RSD.Flags, 8) << ":\n"; - OS << indent(Space) << "Version: " << RSD.Version << ":\n"; - OS << indent(Space) << "NumParameters: " << RSD.NumParameters << ":\n"; - OS << indent(Space) << "RootParametersOffset: " << RSD.RootParametersOffset + OS << indent(Space) << "Flags: " << format_hex(RS.Header.Flags, 8) << ":\n"; + OS << indent(Space) << "Version: " << RS.Header.Version << ":\n"; + OS << indent(Space) << "NumParameters: " << RS.Parameters.size() << ":\n"; + OS << indent(Space) << "RootParametersOffset: " << RS.Parameters.size_in_bytes() << ":\n"; - OS << indent(Space) << "NumStaticSamplers: " << RSD.NumStaticSamplers + OS << indent(Space) << "NumStaticSamplers: " << 0 << ":\n"; - OS << indent(Space) << "StaticSamplersOffset: " << RSD.StaticSamplersOffset + OS << indent(Space) << "StaticSamplersOffset: " << sizeof(RS.Header) + RS.Parameters.size_in_bytes() << ":\n"; Space--; // end root signature header From b2329676db3923cca67c3f9cd12df4b2559aebdd Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 18 Feb 2025 21:45:59 +0000 Subject: [PATCH 198/220] adding support for obj2yaml and initial tests --- llvm/include/llvm/BinaryFormat/DXContainer.h | 35 ++++++++--- .../llvm/MC/DXContainerRootSignature.h | 2 +- llvm/include/llvm/Object/DXContainer.h | 4 ++ .../include/llvm/ObjectYAML/DXContainerYAML.h | 7 +-- llvm/lib/MC/DXContainerRootSignature.cpp | 58 +++++++++---------- llvm/lib/Object/DXContainer.cpp | 34 +++++++++++ llvm/lib/ObjectYAML/DXContainerYAML.cpp | 44 +++++++------- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 11 ++-- .../ContainerData/RootSignature-Flags.ll | 5 +- .../RootSignature-MultipleEntryFunctions.ll | 4 +- .../DXContainer/RootSignature-Flags.yaml | 10 ++-- llvm/unittests/Object/DXContainerTest.cpp | 29 ++++++++++ .../ObjectYAML/DXContainerYAMLTest.cpp | 57 ++++++++++++++++-- 13 files changed, 212 insertions(+), 88 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index bd9c8385b5002..a6bbf773ccc75 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -160,14 +160,14 @@ enum class RootElementFlag : uint32_t { }; #define ROOT_PARAMETER(Val, Enum) Enum = Val, -enum class RootParameterType: uint32_t { +enum class RootParameterType : uint32_t { #include "DXContainerConstants.def" }; ArrayRef> getRootParameterTypes(); #define SHADER_VISIBILITY(Val, Enum) Enum = Val, -enum class ShaderVisibilityFlag: uint32_t { +enum class ShaderVisibilityFlag : uint32_t { #include "DXContainerConstants.def" }; @@ -566,21 +566,40 @@ struct RootConstants { 
uint32_t ShaderRegister; uint32_t RegisterSpace; uint32_t Num32BitValues; + + void swapBytes() { + sys::swapByteOrder(ShaderRegister); + sys::swapByteOrder(RegisterSpace); + sys::swapByteOrder(Num32BitValues); + } }; struct RootParameter { -dxbc::RootParameterType ParameterType; -union { - RootConstants Constants; -}; -dxbc::ShaderVisibilityFlag ShaderVisibility; + dxbc::RootParameterType ParameterType; + union { + RootConstants Constants; + }; + dxbc::ShaderVisibilityFlag ShaderVisibility; + + void swapBytes() { + sys::swapByteOrder(ParameterType); + sys::swapByteOrder(ShaderVisibility); + switch (ParameterType) { + + case RootParameterType::Constants32Bit: + Constants.swapBytes(); + break; + } + } }; struct RootSignatureHeader { uint32_t Version = 2; uint32_t Flags = 0; + void swapBytes() { + // nothing to swap + } }; - struct RootSignatureValidations { diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 6909ae913816e..704490029c32f 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -15,7 +15,7 @@ namespace llvm { class raw_ostream; namespace mcdxbc { - + struct RootSignatureDesc { dxbc::RootSignatureHeader Header; SmallVector Parameters; diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index c3a2f756bd683..ae8720d469d89 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -22,6 +22,7 @@ #include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" #include +#include #include namespace llvm { @@ -126,6 +127,8 @@ class RootSignature { uint32_t StaticSamplersOffset; uint32_t Flags; + SmallVector Parameters; + public: RootSignature() {} @@ -135,6 +138,7 @@ class RootSignature { uint32_t getRootParametersOffset() const { return RootParametersOffset; } uint32_t getNumStaticSamplers() const { return NumStaticSamplers; } uint32_t getStaticSamplersOffset() const { return StaticSamplersOffset; } + SmallVector getParameters() const { return Parameters; } uint32_t getFlags() const { return Flags; } }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 1eb365899dc7f..0bd5cf8ba5a60 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -84,7 +84,6 @@ struct RootSignatureYamlDesc { SmallVector Parameters; - uint32_t getEncodedFlags(); #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -269,13 +268,11 @@ template <> struct MappingTraits { }; template <> struct MappingTraits { - static void mapping(IO &IO, - dxbc::RootParameter &P); + static void mapping(IO &IO, dxbc::RootParameter &P); }; template <> struct MappingTraits { - static void mapping(IO &IO, - dxbc::RootConstants &C); + static void mapping(IO &IO, dxbc::RootConstants &C); }; } // namespace yaml diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 04ed6cd73186d..f4295a6516285 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -16,56 +16,52 @@ using namespace llvm; using namespace llvm::mcdxbc; -template -static uint32_t getSizeOf () { - return static_cast(sizeof(T)); +template static uint32_t getSizeOf() { + return static_cast(sizeof(T)); } - - void RootSignatureDesc::write(raw_ostream &OS) const { uint32_t Offset = 16; - const uint32_t ParametersOffset = getSizeOf() + Offset; 
+ const uint32_t ParametersOffset = + getSizeOf() + Offset; const uint32_t ParameterByteSize = Parameters.size_in_bytes(); - // Writing header information support::endian::write(OS, Header.Version, llvm::endianness::little); - Offset += getSizeOf(); + Offset += getSizeOf(); - support::endian::write(OS, (uint32_t)Parameters.size(), llvm::endianness::little); - Offset += getSizeOf(); + support::endian::write(OS, (uint32_t)Parameters.size(), + llvm::endianness::little); + Offset += getSizeOf(); support::endian::write(OS, ParametersOffset, llvm::endianness::little); - Offset += getSizeOf(); + Offset += getSizeOf(); support::endian::write(OS, ((uint32_t)0), llvm::endianness::little); - Offset += getSizeOf(); + Offset += getSizeOf(); - support::endian::write(OS, ParameterByteSize + ParametersOffset, llvm::endianness::little); - Offset += getSizeOf(); + support::endian::write(OS, ParameterByteSize + ParametersOffset, + llvm::endianness::little); + Offset += getSizeOf(); support::endian::write(OS, Header.Flags, llvm::endianness::little); - for (const dxbc::RootParameter &P : Parameters){ - support::endian::write(OS, P.ParameterType, llvm::endianness::little); + for (const dxbc::RootParameter &P : Parameters) { + support::endian::write(OS, P.ParameterType, llvm::endianness::little); support::endian::write(OS, P.ShaderVisibility, llvm::endianness::little); support::endian::write(OS, Offset, llvm::endianness::little); - Offset += getSizeOf(); - - - switch(P.ParameterType){ - case dxbc::RootParameterType::Constants32Bit:{ - support::endian::write(OS, P.Constants.ShaderRegister, llvm::endianness::little); - Offset += getSizeOf(); - - support::endian::write(OS, P.Constants.RegisterSpace, llvm::endianness::little); - Offset += getSizeOf(); - - support::endian::write(OS, P.Constants.Num32BitValues, llvm::endianness::little); - Offset += getSizeOf(); - } break; - } + switch (P.ParameterType) { + case dxbc::RootParameterType::Constants32Bit: { + support::endian::write(OS, P.Constants.ShaderRegister, + llvm::endianness::little); + support::endian::write(OS, P.Constants.RegisterSpace, + llvm::endianness::little); + support::endian::write(OS, P.Constants.Num32BitValues, + llvm::endianness::little); + Offset += getSizeOf() + 3 * getSizeOf(); + + } break; + } } } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 1eb1453c65147..c83cb98557638 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -12,6 +12,8 @@ #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" #include "llvm/Support/FormatVariadic.h" +#include +#include using namespace llvm; using namespace llvm::object; @@ -247,6 +249,7 @@ void DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { } Error DirectX::RootSignature::parse(StringRef Data) { + const char *Begin = Data.begin(); const char *Current = Data.begin(); // Root Signature headers expects 6 integers to be present. 
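// Orientation note (informal sketch, not normative): the RTS0 blob that
// parse() walks below is laid out, in this patch series, as six little-endian
// uint32_t header fields -- Version, NumParameters, RootParametersOffset,
// NumStaticSamplers, StaticSamplersOffset, Flags -- followed, starting at
// RootParametersOffset, by one record per root parameter:
//
//   uint32_t ParameterType;     // e.g. Constants32Bit
//   uint32_t ShaderVisibility;  // e.g. Hull
//   uint32_t PayloadOffset;     // offset from the start of the part to the
//                               // parameter payload (RootConstants here)
//
// "PayloadOffset" is a name used only in this note; the code below reads the
// same value into a local `Offset` before loading the RootConstants payload.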
@@ -288,6 +291,37 @@ Error DirectX::RootSignature::parse(StringRef Data) { llvm::Twine(FValue)); Flags = FValue; + Current = Begin + RootParametersOffset; + for (uint32_t It = 0; It < NumParameters; It++) { + dxbc::RootParameter NewParam; + + NewParam.ParameterType = + support::endian::read(Current); + Current += sizeof(dxbc::RootParameterType); + + NewParam.ShaderVisibility = + support::endian::read(Current); + Current += sizeof(dxbc::ShaderVisibilityFlag); + + uint32_t Offset = + support::endian::read(Current); + Current += sizeof(uint32_t); + + switch (NewParam.ParameterType) { + + case dxbc::RootParameterType::Constants32Bit: { + if (Error Err = readStruct(Data, Current, NewParam.Constants)) + return Err; + if (sys::IsBigEndianHost) + NewParam.Constants.swapBytes(); + } break; + } + + Parameters.push_back(NewParam); + } + return Error::success(); } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 39e6241d51aab..fc328a55733e7 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -35,6 +35,7 @@ DXContainerYAML::RootSignatureYamlDesc::RootSignatureYamlDesc( NumStaticSamplers(Data.getNumStaticSamplers()), StaticSamplersOffset(Data.getStaticSamplersOffset()) { uint32_t Flags = Data.getFlags(); + Parameters = Data.getParameters(); #define ROOT_ELEMENT_FLAG(Num, Val) \ Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -218,26 +219,25 @@ void MappingTraits::mapping( #include "llvm/BinaryFormat/DXContainerConstants.def" } -void MappingTraits::mapping( - IO &IO, dxbc::RootConstants &C) { - IO.mapRequired("Num32BitValues", C.Num32BitValues); - IO.mapRequired("RegisterSpace", C.RegisterSpace); - IO.mapRequired("ShaderRegister", C.ShaderRegister); - - } +void MappingTraits::mapping(IO &IO, + dxbc::RootConstants &C) { + IO.mapRequired("Num32BitValues", C.Num32BitValues); + IO.mapRequired("RegisterSpace", C.RegisterSpace); + IO.mapRequired("ShaderRegister", C.ShaderRegister); +} -void MappingTraits::mapping( - IO &IO, dxbc::RootParameter &P) { - IO.mapRequired("ParameterType", P.ParameterType); - IO.mapRequired("ShaderVisibility", P.ShaderVisibility); - switch (P.ParameterType) { +void MappingTraits::mapping(IO &IO, + dxbc::RootParameter &P) { + IO.mapRequired("ParameterType", P.ParameterType); + IO.mapRequired("ShaderVisibility", P.ShaderVisibility); + switch (P.ParameterType) { - case dxbc::RootParameterType::Constants32Bit: - IO.mapRequired("Constants", P.Constants); + case dxbc::RootParameterType::Constants32Bit: + IO.mapRequired("Constants", P.Constants); - break; - } + break; } +} void MappingTraits::mapping(IO &IO, DXContainerYAML::Part &P) { @@ -343,15 +343,15 @@ void ScalarEnumerationTraits::enumeration( } void ScalarEnumerationTraits::enumeration( - IO &IO, dxbc::RootParameterType &Value) { -for (const auto &E : dxbc::getRootParameterTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO &IO, dxbc::RootParameterType &Value) { + for (const auto &E : dxbc::getRootParameterTypes()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); } void ScalarEnumerationTraits::enumeration( - IO &IO, dxbc::ShaderVisibilityFlag &Value) { -for (const auto &E : dxbc::getShaderVisibilityFlags()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO &IO, dxbc::ShaderVisibilityFlag &Value) { + for (const auto &E : dxbc::getShaderVisibilityFlags()) + IO.enumCase(Value, E.Name.str().c_str(), E.Value); } } // namespace yaml diff --git 
a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 104cd30d1fa75..10214fd30571f 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -201,12 +201,11 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, OS << indent(Space) << "Flags: " << format_hex(RS.Header.Flags, 8) << ":\n"; OS << indent(Space) << "Version: " << RS.Header.Version << ":\n"; OS << indent(Space) << "NumParameters: " << RS.Parameters.size() << ":\n"; - OS << indent(Space) << "RootParametersOffset: " << RS.Parameters.size_in_bytes() - << ":\n"; - OS << indent(Space) << "NumStaticSamplers: " << 0 - << ":\n"; - OS << indent(Space) << "StaticSamplersOffset: " << sizeof(RS.Header) + RS.Parameters.size_in_bytes() - << ":\n"; + OS << indent(Space) + << "RootParametersOffset: " << RS.Parameters.size_in_bytes() << ":\n"; + OS << indent(Space) << "NumStaticSamplers: " << 0 << ":\n"; + OS << indent(Space) << "StaticSamplersOffset: " + << sizeof(RS.Header) + RS.Parameters.size_in_bytes() << ":\n"; Space--; // end root signature header } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index 3f5bb166ad0e5..035ea373d30ea 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -22,8 +22,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: Size: 24 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 2 -; DXC-NEXT: NumParameters: 0 -; DXC-NEXT: RootParametersOffset: 0 ; DXC-NEXT: NumStaticSamplers: 0 -; DXC-NEXT: StaticSamplersOffset: 0 +; DXC-NEXT: StaticSamplersOffset: 24 +; DXC-NEXT: Parameters: [] ; DXC-NEXT: AllowInputAssemblerInputLayout: true diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll index 7adb17d0b022f..616ce38144095 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll @@ -29,7 +29,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 ; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 +; CHECK-NEXT: StaticSamplersOffset: 8 ; CHECK-LABEL: Definition for 'anotherMain': ; CHECK-NEXT: Flags: 0x000002 @@ -37,4 +37,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; CHECK-NEXT: NumParameters: 0 ; CHECK-NEXT: RootParametersOffset: 0 ; CHECK-NEXT: NumStaticSamplers: 0 -; CHECK-NEXT: StaticSamplersOffset: 0 +; CHECK-NEXT: StaticSamplersOffset: 8 diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index b0a3e6945f454..1b73b830f015f 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -14,10 +14,9 @@ Parts: Size: 24 RootSignature: Version: 2 - NumParameters: 1 - RootParametersOffset: 3 NumStaticSamplers: 4 StaticSamplersOffset: 5 + Parameters: [] AllowInputAssemblerInputLayout: true DenyGeometryShaderRootAccess: true @@ -25,9 +24,8 @@ Parts: # CHECK-NEXT: Size: 24 # CHECK-NEXT: RootSignature: # CHECK-NEXT: Version: 2 -# CHECK-NEXT: NumParameters: 1 -# 
CHECK-NEXT: RootParametersOffset: 3 -# CHECK-NEXT: NumStaticSamplers: 4 -# CHECK-NEXT: StaticSamplersOffset: 5 +# CHECK-NEXT: NumStaticSamplers: 0 +# CHECK-NEXT: StaticSamplersOffset: 24 +# CHECK-NEXT: Parameters: [] # CHECK-NEXT: AllowInputAssemblerInputLayout: true # CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 943022bb4469b..73305350b0102 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -890,3 +890,32 @@ TEST(RootSignature, ParseRootFlags) { "unsupported root signature flag value read: 4278190081")); } } + +TEST(RootSignature, ParseRootConstant) { + { + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<133>(Buffer))); + + const auto &RS = C.getRootSignature(); + ASSERT_TRUE(RS.has_value()); + ASSERT_EQ(RS->getVersion(), 2u); + ASSERT_EQ(RS->getNumParameters(), 1); + ASSERT_EQ(RS->getRootParametersOffset(), 24u); + ASSERT_EQ(RS->getNumStaticSamplers(), 0u); + ASSERT_EQ(RS->getStaticSamplersOffset(), 44u); + ASSERT_EQ(RS->getFlags(), 17u); + } +} diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index b48cd9ce53987..b5fb8d143c0b9 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -127,10 +127,9 @@ TEST(RootSignature, ParseRootFlags) { Size: 24 RootSignature: Version: 2 - NumParameters: 0 - RootParametersOffset: 0 NumStaticSamplers: 0 StaticSamplersOffset: 0 + Parameters: [] AllowInputAssemblerInputLayout: true )")); @@ -139,10 +138,60 @@ TEST(RootSignature, ParseRootFlags) { 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; EXPECT_EQ(Storage.size(), 68u); EXPECT_TRUE(memcmp(Buffer, Storage.data(), 68u) == 0); } + +TEST(RootSignature, ParseRootConstants) { + SmallString<128> Storage; + + // First read a fully explicit yaml with all sizes and offsets provided + ASSERT_TRUE(convert(Storage, R"(--- !dxcontainer + Header: + Hash: [ 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, 0x5, + 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1 ] + Version: + Major: 1 + Minor: 0 + FileSize: 133 + PartCount: 1 + PartOffsets: [ 36 ] + Parts: + - Name: RTS0 + 
Size: 89 + RootSignature: + Version: 2 + NumStaticSamplers: 0 + StaticSamplersOffset: 56 + Parameters: + - ParameterType: Constants32Bit + ShaderVisibility: Hull + Constants: + Num32BitValues: 16 + ShaderRegister: 15 + RegisterSpace: 14 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + )")); + + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + + EXPECT_EQ(Storage.size(), 133u); + EXPECT_TRUE(memcmp(Buffer, Storage.data(), 133u) == 0); +} From 1026a8e10bfdf6ae0282307e0816191215cf45e5 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 19 Feb 2025 01:03:35 +0000 Subject: [PATCH 199/220] multiple parameters support and more testing --- llvm/include/llvm/BinaryFormat/DXContainer.h | 36 ++++++++-- .../BinaryFormat/DXContainerConstants.def | 3 + llvm/lib/MC/DXContainerRootSignature.cpp | 36 +++++----- llvm/lib/Object/DXContainer.cpp | 23 +++++-- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 4 ++ .../RootSignature-MultipleParameters.yaml | 55 ++++++++++++++++ llvm/unittests/Object/DXContainerTest.cpp | 65 +++++++++++++++++++ 7 files changed, 191 insertions(+), 31 deletions(-) create mode 100644 llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index a6bbf773ccc75..53214dafff7e7 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -14,11 +14,11 @@ #define LLVM_BINARYFORMAT_DXCONTAINER_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/BinaryStreamError.h" -#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" +#include #include namespace llvm { @@ -563,9 +563,11 @@ static_assert(sizeof(ProgramSignatureElement) == 32, "ProgramSignatureElement is misaligned"); struct RootConstants { - uint32_t ShaderRegister; - uint32_t RegisterSpace; - uint32_t Num32BitValues; + uint32_t ShaderRegister = 0; + uint32_t RegisterSpace = 0; + uint32_t Num32BitValues = 0; + + RootConstants() = default; void swapBytes() { sys::swapByteOrder(ShaderRegister); @@ -581,7 +583,14 @@ struct RootParameter { }; dxbc::ShaderVisibilityFlag ShaderVisibility; + RootParameter() { + Constants = RootConstants(); + ParameterType = dxbc::RootParameterType::Empty; + ShaderVisibility = dxbc::ShaderVisibilityFlag::Empty; + } + void swapBytes() { + sys::swapByteOrder(ParameterType); sys::swapByteOrder(ShaderVisibility); switch (ParameterType) { @@ -589,6 +598,9 @@ struct RootParameter { case RootParameterType::Constants32Bit: Constants.swapBytes(); break; + case RootParameterType::Empty: + llvm_unreachable("invalid value for ParameterType"); + break; 
} } }; @@ -596,8 +608,10 @@ struct RootParameter { struct RootSignatureHeader { uint32_t Version = 2; uint32_t Flags = 0; + void swapBytes() { - // nothing to swap + sys::swapByteOrder(Version); + sys::swapByteOrder(Flags); } }; @@ -608,6 +622,16 @@ struct RootSignatureValidations { static bool isValidVersion(uint32_t Version) { return (Version == 1 || Version == 2); } + + static bool isValidParameterType(dxbc::RootParameterType Flag) { + // RootParameterType::Empty is the higest value in the enum. + return Flag < dxbc::RootParameterType::Empty; + } + + static bool isValidShaderVisibility(dxbc::ShaderVisibilityFlag Flag) { + // ShaderVisibilityFlag::Empty is the higest value in the enum. + return Flag < dxbc::ShaderVisibilityFlag::Empty; + } }; } // namespace dxbc diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index e978d438a5f1c..6cdd5b8226162 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -75,6 +75,8 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) #ifdef ROOT_PARAMETER ROOT_PARAMETER(1, Constants32Bit) +ROOT_PARAMETER(5, Empty) + #undef ROOT_PARAMETER #endif // ROOT_PARAMETER @@ -87,6 +89,7 @@ SHADER_VISIBILITY(4, Geometry) SHADER_VISIBILITY(5, Pixel) SHADER_VISIBILITY(6, Amplification) SHADER_VISIBILITY(7, Mesh) +SHADER_VISIBILITY(8, Empty) #undef SHADER_VISIBILITY #endif // SHADER_VISIBILITY diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index f4295a6516285..a7f4b084f0714 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -10,47 +10,41 @@ #include "llvm/ADT/bit.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/ErrorHandling.h" #include #include using namespace llvm; using namespace llvm::mcdxbc; -template static uint32_t getSizeOf() { - return static_cast(sizeof(T)); -} - void RootSignatureDesc::write(raw_ostream &OS) const { - uint32_t Offset = 16; - const uint32_t ParametersOffset = - getSizeOf() + Offset; + const uint32_t HeaderSize = 24; const uint32_t ParameterByteSize = Parameters.size_in_bytes(); // Writing header information support::endian::write(OS, Header.Version, llvm::endianness::little); - Offset += getSizeOf(); - support::endian::write(OS, (uint32_t)Parameters.size(), llvm::endianness::little); - Offset += getSizeOf(); - - support::endian::write(OS, ParametersOffset, llvm::endianness::little); - Offset += getSizeOf(); - + support::endian::write(OS, HeaderSize, llvm::endianness::little); support::endian::write(OS, ((uint32_t)0), llvm::endianness::little); - Offset += getSizeOf(); - - support::endian::write(OS, ParameterByteSize + ParametersOffset, + // TODO: this value means nothing right now... 
+ support::endian::write(OS, ParameterByteSize + HeaderSize, llvm::endianness::little); - Offset += getSizeOf(); support::endian::write(OS, Header.Flags, llvm::endianness::little); + uint32_t ParamsOffset = + HeaderSize + (3 * sizeof(uint32_t) * Parameters.size()); for (const dxbc::RootParameter &P : Parameters) { support::endian::write(OS, P.ParameterType, llvm::endianness::little); support::endian::write(OS, P.ShaderVisibility, llvm::endianness::little); - support::endian::write(OS, Offset, llvm::endianness::little); + support::endian::write(OS, ParamsOffset, llvm::endianness::little); + + // Size of root parameter, removing the ParameterType and ShaderVisibility. + ParamsOffset += sizeof(dxbc::RootParameter) - 2 * sizeof(uint32_t); + } + for (const dxbc::RootParameter &P : Parameters) { switch (P.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { support::endian::write(OS, P.Constants.ShaderRegister, @@ -59,9 +53,9 @@ void RootSignatureDesc::write(raw_ostream &OS) const { llvm::endianness::little); support::endian::write(OS, P.Constants.Num32BitValues, llvm::endianness::little); - Offset += getSizeOf() + 3 * getSizeOf(); - } break; + case dxbc::RootParameterType::Empty: + llvm_unreachable("Invalid RootParameterType"); } } } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index c83cb98557638..daf44f8fe28fb 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -11,6 +11,7 @@ #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include #include @@ -295,14 +296,27 @@ Error DirectX::RootSignature::parse(StringRef Data) { for (uint32_t It = 0; It < NumParameters; It++) { dxbc::RootParameter NewParam; + dxbc::RootParameterType PTValue = + support::endian::read(Current); + if (!dxbc::RootSignatureValidations::isValidParameterType(PTValue)) + return validationFailed("unsupported parameter type value read: " + + llvm::Twine((uint32_t)PTValue)); + NewParam.ParameterType = support::endian::read(Current); Current += sizeof(dxbc::RootParameterType); - NewParam.ShaderVisibility = + dxbc::ShaderVisibilityFlag SVValue = support::endian::read(Current); + + if (!dxbc::RootSignatureValidations::isValidShaderVisibility(SVValue)) + return validationFailed("unsupported shader visility flag value read: " + + llvm::Twine((uint32_t)SVValue)); + + NewParam.ShaderVisibility = SVValue; Current += sizeof(dxbc::ShaderVisibilityFlag); uint32_t Offset = @@ -312,11 +326,12 @@ Error DirectX::RootSignature::parse(StringRef Data) { switch (NewParam.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { - if (Error Err = readStruct(Data, Current, NewParam.Constants)) + if (Error Err = readStruct(Data, Begin + Offset, NewParam.Constants)) return Err; - if (sys::IsBigEndianHost) - NewParam.Constants.swapBytes(); } break; + case dxbc::RootParameterType::Empty: + llvm_unreachable("Invalid value for RootParameterType"); + break; } Parameters.push_back(NewParam); diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index fc328a55733e7..bb5e43af56606 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -14,6 +14,7 @@ #include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ScopedPrinter.h" namespace llvm 
{ @@ -235,6 +236,9 @@ void MappingTraits::mapping(IO &IO, case dxbc::RootParameterType::Constants32Bit: IO.mapRequired("Constants", P.Constants); + break; + case dxbc::RootParameterType::Empty: + llvm_unreachable("Invalid value for ParameterType"); break; } } diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml new file mode 100644 index 0000000000000..bccfacb72819b --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml @@ -0,0 +1,55 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 80 + RootSignature: + Version: 2 + NumStaticSamplers: 0 + StaticSamplersOffset: 64 + Parameters: + - ParameterType: Constants32Bit + ShaderVisibility: Hull + Constants: + Num32BitValues: 16 + ShaderRegister: 15 + RegisterSpace: 14 + - ParameterType: Constants32Bit + ShaderVisibility: Geometry + Constants: + Num32BitValues: 21 + ShaderRegister: 22 + RegisterSpace: 23 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + +# CHECK: - Name: RTS0 +# CHECK-NEXT: Size: 80 +# CHECK-NEXT: RootSignature: +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: NumStaticSamplers: 0 +# CHECK-NEXT: StaticSamplersOffset: 64 +# CHECK-NEXT: Parameters: +# CHECK-NEXT: - ParameterType: Constants32Bit +# CHECK-NEXT: ShaderVisibility: Hull +# CHECK-NEXT: Constants: +# CHECK-NEXT: Num32BitValues: 16 +# CHECK-NEXT: RegisterSpace: 14 +# CHECK-NEXT: ShaderRegister: 15 +# CHECK-NEXT: - ParameterType: Constants32Bit +# CHECK-NEXT: ShaderVisibility: Geometry +# CHECK-NEXT: Constants: +# CHECK-NEXT: Num32BitValues: 21 +# CHECK-NEXT: RegisterSpace: 23 +# CHECK-NEXT: ShaderRegister: 22 +# CHECK-NEXT: AllowInputAssemblerInputLayout: true +# CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 73305350b0102..c7a5e72585271 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/MemoryBufferRef.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" +#include using namespace llvm; using namespace llvm::object; @@ -917,5 +918,69 @@ TEST(RootSignature, ParseRootConstant) { ASSERT_EQ(RS->getNumStaticSamplers(), 0u); ASSERT_EQ(RS->getStaticSamplersOffset(), 44u); ASSERT_EQ(RS->getFlags(), 17u); + + const auto RootParam = RS->getParameters()[0]; + ASSERT_EQ((uint32_t)RootParam.ParameterType, 1u); + ASSERT_EQ((uint32_t)RootParam.ShaderVisibility, 2u); + ASSERT_EQ(RootParam.Constants.ShaderRegister, 15u); + ASSERT_EQ(RootParam.Constants.RegisterSpace, 14u); + ASSERT_EQ(RootParam.Constants.Num32BitValues, 16u); + } + { + // ParameterType has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 
0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<133>(Buffer)), + FailedWithMessage("unsupported parameter type value read: 255")); + } + { + // ShaderVisibility has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<133>(Buffer)), + FailedWithMessage("unsupported shader visility flag value read: 255")); + } + { + // Offset has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00}; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<133>(Buffer)), + FailedWithMessage("Reading structure out of file bounds")); } } From 00175bf134320117a9483d7a9e66790f6922455b Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 19 Feb 2025 18:45:41 +0000 Subject: [PATCH 200/220] clean up --- llvm/include/llvm/BinaryFormat/DXContainer.h | 5 +---- .../llvm/BinaryFormat/DXContainerConstants.def | 2 +- llvm/include/llvm/MC/DXContainerRootSignature.h | 1 - llvm/include/llvm/Object/DXContainer.h | 1 - llvm/lib/MC/DXContainerRootSignature.cpp | 14 +++++++------- llvm/lib/Object/DXContainer.cpp | 4 ---- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 1 - llvm/lib/ObjectYAML/DXContainerYAML.cpp | 4 ---- llvm/unittests/Object/DXContainerTest.cpp | 1 - 9 files changed, 9 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 53214dafff7e7..e1081cf370871 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -14,11 +14,9 @@ #define LLVM_BINARYFORMAT_DXCONTAINER_H #include 
"llvm/ADT/StringRef.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" -#include #include namespace llvm { @@ -579,7 +577,7 @@ struct RootConstants { struct RootParameter { dxbc::RootParameterType ParameterType; union { - RootConstants Constants; + dxbc::RootConstants Constants; }; dxbc::ShaderVisibilityFlag ShaderVisibility; @@ -594,7 +592,6 @@ struct RootParameter { sys::swapByteOrder(ParameterType); sys::swapByteOrder(ShaderVisibility); switch (ParameterType) { - case RootParameterType::Constants32Bit: Constants.swapBytes(); break; diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 6cdd5b8226162..1bf3aa6096c1c 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -76,11 +76,11 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) ROOT_PARAMETER(1, Constants32Bit) ROOT_PARAMETER(5, Empty) - #undef ROOT_PARAMETER #endif // ROOT_PARAMETER #ifdef SHADER_VISIBILITY + SHADER_VISIBILITY(0, All) SHADER_VISIBILITY(1, Vertex) SHADER_VISIBILITY(2, Hull) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 704490029c32f..1ef51a8f87aef 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -15,7 +15,6 @@ namespace llvm { class raw_ostream; namespace mcdxbc { - struct RootSignatureDesc { dxbc::RootSignatureHeader Header; SmallVector Parameters; diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index ae8720d469d89..ddcc025c15460 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -22,7 +22,6 @@ #include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" #include -#include #include namespace llvm { diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index a7f4b084f0714..bb9c3972806f4 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,27 +7,27 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" -#include "llvm/ADT/bit.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include -#include using namespace llvm; using namespace llvm::mcdxbc; void RootSignatureDesc::write(raw_ostream &OS) const { + // Root signature header in dxcontainer has 6 uint_32t values. const uint32_t HeaderSize = 24; const uint32_t ParameterByteSize = Parameters.size_in_bytes(); + const uint32_t NumParametes = Parameters.size(); + const uint32_t Zero = 0; // Writing header information support::endian::write(OS, Header.Version, llvm::endianness::little); - support::endian::write(OS, (uint32_t)Parameters.size(), - llvm::endianness::little); + support::endian::write(OS, NumParametes, llvm::endianness::little); support::endian::write(OS, HeaderSize, llvm::endianness::little); - support::endian::write(OS, ((uint32_t)0), llvm::endianness::little); - // TODO: this value means nothing right now... 
+ + // Static samplers still not implemented + support::endian::write(OS, Zero, llvm::endianness::little); support::endian::write(OS, ParameterByteSize + HeaderSize, llvm::endianness::little); diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index daf44f8fe28fb..839329272f93c 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -11,10 +11,7 @@ #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" -#include -#include using namespace llvm; using namespace llvm::object; @@ -331,7 +328,6 @@ Error DirectX::RootSignature::parse(StringRef Data) { } break; case dxbc::RootParameterType::Empty: llvm_unreachable("Invalid value for RootParameterType"); - break; } Parameters.push_back(NewParam); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index eef2f0e8c366e..87ba16fd40ba9 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -19,7 +19,6 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index bb5e43af56606..766bb36a23a87 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -14,7 +14,6 @@ #include "llvm/ObjectYAML/DXContainerYAML.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/DXContainer.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ScopedPrinter.h" namespace llvm { @@ -232,14 +231,11 @@ void MappingTraits::mapping(IO &IO, IO.mapRequired("ParameterType", P.ParameterType); IO.mapRequired("ShaderVisibility", P.ShaderVisibility); switch (P.ParameterType) { - case dxbc::RootParameterType::Constants32Bit: IO.mapRequired("Constants", P.Constants); - break; case dxbc::RootParameterType::Empty: llvm_unreachable("Invalid value for ParameterType"); - break; } } diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index c7a5e72585271..d4981293b029d 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -14,7 +14,6 @@ #include "llvm/Support/MemoryBufferRef.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" -#include using namespace llvm; using namespace llvm::object; From 9ed2adc0aa3462ce7892362ac4c3c24f3336208c Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 19 Feb 2025 22:54:36 +0000 Subject: [PATCH 201/220] fixing formating --- llvm/include/llvm/MC/DXContainerRootSignature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 1ef51a8f87aef..c6ddfccd0961d 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -38,6 +38,6 @@ struct RootParameter { dxbc::ShaderVisibilityFlag ShaderVisibility; void write(raw_ostream &OS); -}; +}; } // namespace mcdxbc } // namespace llvm From e8252ba0020a56fd5676ca4031b26b98abeb30c9 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 19 Feb 2025 23:05:10 +0000 Subject: [PATCH 202/220] reapply rebase fix --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 10214fd30571f..8702f0eecf2aa 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -192,9 +192,12 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, OS << "Root Signature Definitions" << "\n"; uint8_t Space = 0; - for (const auto &P : RSDMap) { - const auto &[Function, RSD] = P; - OS << "Definition for '" << Function->getName() << "':\n"; + for (const Function &F : M) { + auto It = RSDMap.find(&F); + if (It == RSDMap.end()) + continue; + const auto &RS = It->second; + OS << "Definition for '" << F.getName() << "':\n"; // start root signature header Space++; From 4de5c298011a361356180d60886d641571f31ca4 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Thu, 20 Feb 2025 19:35:17 +0000 Subject: [PATCH 203/220] clean up --- .../include/llvm/MC/DXContainerRootSignature.h | 18 ------------------ llvm/lib/MC/DXContainerRootSignature.cpp | 2 -- 2 files changed, 20 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index c6ddfccd0961d..b31b0da352038 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -21,23 +21,5 @@ struct RootSignatureDesc { void write(raw_ostream &OS) const; }; - -struct RootConstants { - uint32_t ShaderRegister; - uint32_t RegisterSpace; - uint32_t Num32BitValues; - - void write(raw_ostream &OS); -}; - -struct RootParameter { - dxbc::RootParameterType ParameterType; - union { - RootConstants Constants; - }; - dxbc::ShaderVisibilityFlag ShaderVisibility; - - void write(raw_ostream &OS); -}; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index bb9c3972806f4..35a4ef322d01e 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" -#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/EndianStream.h" -#include using namespace llvm; using namespace llvm::mcdxbc; From fe13b616cd5c98dbad8951e75098d2a4d3e34208 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Sat, 22 Feb 2025 05:53:24 +0000 Subject: [PATCH 204/220] addressing pr comments --- llvm/include/llvm/BinaryFormat/DXContainer.h | 20 +++++++---------- .../include/llvm/ObjectYAML/DXContainerYAML.h | 2 +- llvm/lib/BinaryFormat/DXContainer.cpp | 8 +++---- llvm/lib/Object/DXContainer.cpp | 22 ++++++++----------- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 6 ++--- 5 files changed, 25 insertions(+), 33 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index e1081cf370871..5db665f77160c 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -165,11 +165,11 @@ enum class RootParameterType : uint32_t { ArrayRef> getRootParameterTypes(); #define SHADER_VISIBILITY(Val, Enum) Enum = Val, -enum class ShaderVisibilityFlag : uint32_t { +enum class ShaderVisibility : uint32_t { #include "DXContainerConstants.def" }; -ArrayRef> getShaderVisibilityFlags(); +ArrayRef> getShaderVisibility(); PartType parsePartType(StringRef S); @@ -565,8 +565,6 @@ struct RootConstants { uint32_t RegisterSpace = 0; uint32_t Num32BitValues = 0; - 
RootConstants() = default; - void swapBytes() { sys::swapByteOrder(ShaderRegister); sys::swapByteOrder(RegisterSpace); @@ -579,16 +577,14 @@ struct RootParameter { union { dxbc::RootConstants Constants; }; - dxbc::ShaderVisibilityFlag ShaderVisibility; + dxbc::ShaderVisibility ShaderVisibility; RootParameter() { - Constants = RootConstants(); ParameterType = dxbc::RootParameterType::Empty; - ShaderVisibility = dxbc::ShaderVisibilityFlag::Empty; + ShaderVisibility = dxbc::ShaderVisibility::Empty; } void swapBytes() { - sys::swapByteOrder(ParameterType); sys::swapByteOrder(ShaderVisibility); switch (ParameterType) { @@ -620,14 +616,14 @@ struct RootSignatureValidations { return (Version == 1 || Version == 2); } - static bool isValidParameterType(dxbc::RootParameterType Flag) { + static bool isValidParameterType(dxbc::RootParameterType Type) { // RootParameterType::Empty is the higest value in the enum. - return Flag < dxbc::RootParameterType::Empty; + return Type < dxbc::RootParameterType::Empty; } - static bool isValidShaderVisibility(dxbc::ShaderVisibilityFlag Flag) { + static bool isValidShaderVisibility(dxbc::ShaderVisibility Visibility) { // ShaderVisibilityFlag::Empty is the higest value in the enum. - return Flag < dxbc::ShaderVisibilityFlag::Empty; + return Visibility < dxbc::ShaderVisibility::Empty; } }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 0bd5cf8ba5a60..f1c11379e1fb0 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -202,7 +202,7 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision) LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::RootParameterType) -LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::ShaderVisibilityFlag) +LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::ShaderVisibility) namespace llvm { diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index d1a1f1f544137..8e7b7d313706a 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -60,14 +60,14 @@ ArrayRef> dxbc::getSigComponentTypes() { return ArrayRef(SigComponentTypes); } -#define SHADER_VISIBILITY(Val, Enum) {#Enum, ShaderVisibilityFlag::Enum}, +#define SHADER_VISIBILITY(Val, Enum) {#Enum, ShaderVisibility::Enum}, -static const EnumEntry ShaderVisibilityFlags[] = { +static const EnumEntry ShaderVisibilityValues[] = { #include "llvm/BinaryFormat/DXContainerConstants.def" }; -ArrayRef> dxbc::getShaderVisibilityFlags() { - return ArrayRef(ShaderVisibilityFlags); +ArrayRef> dxbc::getShaderVisibility() { + return ArrayRef(ShaderVisibilityValues); } #define ROOT_PARAMETER(Val, Enum) {#Enum, RootParameterType::Enum}, diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 839329272f93c..010f70a952ebf 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -293,28 +293,23 @@ Error DirectX::RootSignature::parse(StringRef Data) { for (uint32_t It = 0; It < NumParameters; It++) { dxbc::RootParameter NewParam; - dxbc::RootParameterType PTValue = + NewParam.ParameterType = support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidParameterType(PTValue)) + if (!dxbc::RootSignatureValidations::isValidParameterType(NewParam.ParameterType)) return validationFailed("unsupported parameter type value read: " + - 
llvm::Twine((uint32_t)PTValue)); + llvm::Twine((uint32_t)NewParam.ParameterType)); - NewParam.ParameterType = - support::endian::read(Current); Current += sizeof(dxbc::RootParameterType); - dxbc::ShaderVisibilityFlag SVValue = - support::endian::read(Current); - - if (!dxbc::RootSignatureValidations::isValidShaderVisibility(SVValue)) + if (!dxbc::RootSignatureValidations::isValidShaderVisibility(NewParam.ShaderVisibility)) return validationFailed("unsupported shader visility flag value read: " + - llvm::Twine((uint32_t)SVValue)); + llvm::Twine((uint32_t)NewParam.ShaderVisibility)); - NewParam.ShaderVisibility = SVValue; - Current += sizeof(dxbc::ShaderVisibilityFlag); + Current += sizeof(dxbc::ShaderVisibility); uint32_t Offset = support::endian::read(Current); @@ -327,6 +322,7 @@ Error DirectX::RootSignature::parse(StringRef Data) { return Err; } break; case dxbc::RootParameterType::Empty: + // unreachable because it was validated and assigned before this point. llvm_unreachable("Invalid value for RootParameterType"); } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 766bb36a23a87..bab2cb550be73 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -348,9 +348,9 @@ void ScalarEnumerationTraits::enumeration( IO.enumCase(Value, E.Name.str().c_str(), E.Value); } -void ScalarEnumerationTraits::enumeration( - IO &IO, dxbc::ShaderVisibilityFlag &Value) { - for (const auto &E : dxbc::getShaderVisibilityFlags()) +void ScalarEnumerationTraits::enumeration( + IO &IO, dxbc::ShaderVisibility &Value) { + for (const auto &E : dxbc::getShaderVisibility()) IO.enumCase(Value, E.Name.str().c_str(), E.Value); } From 767b7d02bf71e6e0137e2b5f9f9b8d8b799ac81f Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Sat, 22 Feb 2025 06:19:15 +0000 Subject: [PATCH 205/220] first working version --- .../llvm/MC/DXContainerRootSignature.h | 5 +- llvm/lib/MC/DXContainerRootSignature.cpp | 90 +++++++++++++------ llvm/lib/Object/DXContainer.cpp | 10 ++- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 4 +- .../lib/Target/DirectX/DXContainerGlobals.cpp | 5 +- .../ContainerData/RootSignature-Flags.ll | 2 +- .../DXContainer/RootSignature-Flags.yaml | 2 +- .../RootSignature-MultipleParameters.yaml | 2 +- .../ObjectYAML/DXContainerYAMLTest.cpp | 8 +- 9 files changed, 85 insertions(+), 43 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index b31b0da352038..cae4f79aef56f 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -7,8 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/BinaryFormat/DXContainer.h" -#include -#include +#include "llvm/Support/raw_ostream.h" namespace llvm { @@ -19,7 +18,7 @@ struct RootSignatureDesc { dxbc::RootSignatureHeader Header; SmallVector Parameters; - void write(raw_ostream &OS) const; + Error write(raw_ostream &OS) const; }; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 35a4ef322d01e..9a279ab722877 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,53 +7,89 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" -#include "llvm/Support/EndianStream.h" +#include 
"llvm/Support/BinaryStreamWriter.h" +#include using namespace llvm; using namespace llvm::mcdxbc; -void RootSignatureDesc::write(raw_ostream &OS) const { - // Root signature header in dxcontainer has 6 uint_32t values. - const uint32_t HeaderSize = 24; - const uint32_t ParameterByteSize = Parameters.size_in_bytes(); - const uint32_t NumParametes = Parameters.size(); +Error RootSignatureDesc::write(raw_ostream &OS) const { + // Header Size + accounting for parameter offset + parameters size + std::vector Buffer(24 + (Parameters.size() * 4) + + Parameters.size_in_bytes()); + BinaryStreamWriter Writer(Buffer, llvm::endianness::little); + + SmallVector OffsetsToReplace; + SmallVector ValuesToReplaceOffsetsWith; + const uint32_t Dummy = std::numeric_limits::max(); + + const uint32_t NumParameters = Parameters.size(); const uint32_t Zero = 0; - // Writing header information - support::endian::write(OS, Header.Version, llvm::endianness::little); - support::endian::write(OS, NumParametes, llvm::endianness::little); - support::endian::write(OS, HeaderSize, llvm::endianness::little); + if (Error Err = Writer.writeInteger(Header.Version)) + return Err; + + if (Error Err = Writer.writeInteger(NumParameters)) + return Err; + + OffsetsToReplace.push_back(Writer.getOffset()); + if (Error Err = Writer.writeInteger(Dummy)) + return Err; // Static samplers still not implemented - support::endian::write(OS, Zero, llvm::endianness::little); - support::endian::write(OS, ParameterByteSize + HeaderSize, - llvm::endianness::little); + if (Error Err = Writer.writeInteger(Zero)) + return Err; + + if (Error Err = Writer.writeInteger(Zero)) + return Err; + + if (Error Err = Writer.writeInteger(Header.Flags)) + return Err; - support::endian::write(OS, Header.Flags, llvm::endianness::little); + ValuesToReplaceOffsetsWith.push_back(Writer.getOffset()); - uint32_t ParamsOffset = - HeaderSize + (3 * sizeof(uint32_t) * Parameters.size()); for (const dxbc::RootParameter &P : Parameters) { - support::endian::write(OS, P.ParameterType, llvm::endianness::little); - support::endian::write(OS, P.ShaderVisibility, llvm::endianness::little); - support::endian::write(OS, ParamsOffset, llvm::endianness::little); + if (Error Err = Writer.writeEnum(P.ParameterType)) + return Err; + if (Error Err = Writer.writeEnum(P.ShaderVisibility)) + return Err; - // Size of root parameter, removing the ParameterType and ShaderVisibility. 
- ParamsOffset += sizeof(dxbc::RootParameter) - 2 * sizeof(uint32_t); + OffsetsToReplace.push_back(Writer.getOffset()); + if (Error Err = Writer.writeInteger(Dummy)) + return Err; } for (const dxbc::RootParameter &P : Parameters) { + ValuesToReplaceOffsetsWith.push_back(Writer.getOffset()); switch (P.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { - support::endian::write(OS, P.Constants.ShaderRegister, - llvm::endianness::little); - support::endian::write(OS, P.Constants.RegisterSpace, - llvm::endianness::little); - support::endian::write(OS, P.Constants.Num32BitValues, - llvm::endianness::little); + if (Error Err = Writer.writeInteger(P.Constants.ShaderRegister)) + return Err; + if (Error Err = Writer.writeInteger(P.Constants.RegisterSpace)) + return Err; + if (Error Err = Writer.writeInteger(P.Constants.Num32BitValues)) + return Err; } break; case dxbc::RootParameterType::Empty: llvm_unreachable("Invalid RootParameterType"); } } + + assert(ValuesToReplaceOffsetsWith.size() == OffsetsToReplace.size() && + "Offset missing value to replace with."); + + for (size_t It = 0; It < ValuesToReplaceOffsetsWith.size(); It++) { + uint32_t Position = OffsetsToReplace[It]; + uint32_t Value = ValuesToReplaceOffsetsWith[It]; + + Writer.setOffset(Position); + if (Error Err = Writer.writeInteger(Value)) + return Err; + } + + llvm::ArrayRef BufferRef(reinterpret_cast(Buffer.data()), + Buffer.size()); + OS.write(BufferRef.data(), BufferRef.size()); + + return Error::success(); } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 010f70a952ebf..35261b661cf2f 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -296,16 +296,18 @@ Error DirectX::RootSignature::parse(StringRef Data) { NewParam.ParameterType = support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidParameterType(NewParam.ParameterType)) + if (!dxbc::RootSignatureValidations::isValidParameterType( + NewParam.ParameterType)) return validationFailed("unsupported parameter type value read: " + llvm::Twine((uint32_t)NewParam.ParameterType)); Current += sizeof(dxbc::RootParameterType); NewParam.ShaderVisibility = - support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidShaderVisibility(NewParam.ShaderVisibility)) + support::endian::read( + Current); + if (!dxbc::RootSignatureValidations::isValidShaderVisibility( + NewParam.ShaderVisibility)) return validationFailed("unsupported shader visility flag value read: " + llvm::Twine((uint32_t)NewParam.ShaderVisibility)); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 87ba16fd40ba9..a5831a69f9bca 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -271,7 +271,9 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { RS.Header.Version = P.RootSignature->Version; RS.Parameters = std::move(P.RootSignature->Parameters); - RS.write(OS); + if (Error Err = RS.write(OS)) + handleAllErrors(std::move(Err)); + break; } uint64_t BytesWritten = OS.tell() - DataStart; diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 5508af40663b1..5801046f83674 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -25,10 +25,12 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/DXContainerPSVInfo.h" #include "llvm/Pass.h" +#include "llvm/Support/Error.h" #include 
"llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include +#include using namespace llvm; using namespace llvm::dxil; @@ -173,7 +175,8 @@ void DXContainerGlobals::addRootSignature(Module &M, SmallString<256> Data; raw_svector_ostream OS(Data); - RS.write(OS); + if (Error Err = RS.write(OS)) + handleAllErrors(std::move(Err)); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll index 035ea373d30ea..1ca6ebb7ddab8 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll @@ -23,6 +23,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 2 ; DXC-NEXT: NumStaticSamplers: 0 -; DXC-NEXT: StaticSamplersOffset: 24 +; DXC-NEXT: StaticSamplersOffset: 0 ; DXC-NEXT: Parameters: [] ; DXC-NEXT: AllowInputAssemblerInputLayout: true diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml index 1b73b830f015f..0d7902afdaa66 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -25,7 +25,7 @@ Parts: # CHECK-NEXT: RootSignature: # CHECK-NEXT: Version: 2 # CHECK-NEXT: NumStaticSamplers: 0 -# CHECK-NEXT: StaticSamplersOffset: 24 +# CHECK-NEXT: StaticSamplersOffset: 0 # CHECK-NEXT: Parameters: [] # CHECK-NEXT: AllowInputAssemblerInputLayout: true # CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml index bccfacb72819b..8d5ab5c1b0b23 100644 --- a/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-MultipleParameters.yaml @@ -37,7 +37,7 @@ Parts: # CHECK-NEXT: RootSignature: # CHECK-NEXT: Version: 2 # CHECK-NEXT: NumStaticSamplers: 0 -# CHECK-NEXT: StaticSamplersOffset: 64 +# CHECK-NEXT: StaticSamplersOffset: 0 # CHECK-NEXT: Parameters: # CHECK-NEXT: - ParameterType: Constants32Bit # CHECK-NEXT: ShaderVisibility: Hull diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index b5fb8d143c0b9..fed941f685272 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -134,12 +134,12 @@ TEST(RootSignature, ParseRootFlags) { )")); uint8_t Buffer[] = { - 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, - 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9a, 0x53, 0xd8, 0xec, 0xbe, 0x35, 0x6f, + 0x05, 0x39, 0xe1, 0xfe, 0x31, 0x20, 0xf0, 0xc1, 0x01, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }; EXPECT_EQ(Storage.size(), 68u); @@ -184,7 +184,7 @@ TEST(RootSignature, ParseRootConstants) { 0x85, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 
0x30, 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x2c, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, From 8434dc2f9d1d764191fa545210d452ba76a76909 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Sat, 22 Feb 2025 06:20:37 +0000 Subject: [PATCH 206/220] formating --- llvm/lib/Object/DXContainer.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 010f70a952ebf..35261b661cf2f 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -296,16 +296,18 @@ Error DirectX::RootSignature::parse(StringRef Data) { NewParam.ParameterType = support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidParameterType(NewParam.ParameterType)) + if (!dxbc::RootSignatureValidations::isValidParameterType( + NewParam.ParameterType)) return validationFailed("unsupported parameter type value read: " + llvm::Twine((uint32_t)NewParam.ParameterType)); Current += sizeof(dxbc::RootParameterType); NewParam.ShaderVisibility = - support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidShaderVisibility(NewParam.ShaderVisibility)) + support::endian::read( + Current); + if (!dxbc::RootSignatureValidations::isValidShaderVisibility( + NewParam.ShaderVisibility)) return validationFailed("unsupported shader visility flag value read: " + llvm::Twine((uint32_t)NewParam.ShaderVisibility)); From d39172781e9f7e882de78d2e24a7bd1bd8c48688 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 24 Feb 2025 20:57:23 +0000 Subject: [PATCH 207/220] moving the offset logic to it's own class --- .../llvm/MC/DXContainerRootSignature.h | 24 ++++++ llvm/lib/MC/DXContainerRootSignature.cpp | 83 +++++++++++++------ 2 files changed, 81 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index cae4f79aef56f..ca958c0780cdd 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -7,18 +7,42 @@ //===----------------------------------------------------------------------===// #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/raw_ostream.h" +#include +#include namespace llvm { class raw_ostream; namespace mcdxbc { + +class StreamOffsetHelper { +private: + std::map> OffsetsMaping; + BinaryStreamWriter &Stream; + +public: + explicit StreamOffsetHelper(BinaryStreamWriter &Stream) : Stream(Stream) {} + + Error addOffset(std::string Key); + + void addRewriteValue(std::string Key); + + Error rewrite(); +}; + struct RootSignatureDesc { dxbc::RootSignatureHeader Header; SmallVector Parameters; Error write(raw_ostream &OS) const; + + uint32_t getSizeInBytes() const { + // Header Size + accounting for parameter offset + parameters size + return 24 + (Parameters.size() * 4) + Parameters.size_in_bytes(); + } }; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 9a279ab722877..828785980c15d 100644 --- 
a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,21 +7,55 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/BinaryStreamWriter.h" -#include using namespace llvm; using namespace llvm::mcdxbc; +Error StreamOffsetHelper::addOffset(std::string Key) { + const uint32_t DummyValue = std::numeric_limits::max(); + + uint32_t Offset = Stream.getOffset(); + auto Value = std::make_pair(Offset, DummyValue); + + OffsetsMaping.insert_or_assign(Key, Value); + + if (Error Err = Stream.writeInteger(DummyValue)) + return Err; + + return Error::success(); +} + +void StreamOffsetHelper::addRewriteValue(std::string Key) { + auto It = OffsetsMaping.find(Key); + assert(It != OffsetsMaping.end() && "Offset address was not found."); + auto [Offset, _] = It->second; + + uint32_t Value = Stream.getOffset(); + + std::pair NewValue = std::make_pair(Offset, Value); + OffsetsMaping.insert_or_assign(Key, NewValue); +} + +Error StreamOffsetHelper::rewrite() { + for (auto &[Key, RewriteInfo] : OffsetsMaping) { + auto [Position, Value] = RewriteInfo; + assert(Value != std::numeric_limits::max()); + + Stream.setOffset(Position); + if (Error Err = Stream.writeInteger(Value)) + return Err; + } + + return Error::success(); +} + Error RootSignatureDesc::write(raw_ostream &OS) const { - // Header Size + accounting for parameter offset + parameters size - std::vector Buffer(24 + (Parameters.size() * 4) + - Parameters.size_in_bytes()); + std::vector Buffer(getSizeInBytes()); BinaryStreamWriter Writer(Buffer, llvm::endianness::little); - SmallVector OffsetsToReplace; - SmallVector ValuesToReplaceOffsetsWith; - const uint32_t Dummy = std::numeric_limits::max(); + StreamOffsetHelper OffsetMap(Writer); const uint32_t NumParameters = Parameters.size(); const uint32_t Zero = 0; @@ -32,8 +66,7 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { if (Error Err = Writer.writeInteger(NumParameters)) return Err; - OffsetsToReplace.push_back(Writer.getOffset()); - if (Error Err = Writer.writeInteger(Dummy)) + if (Error Err = OffsetMap.addOffset("header")) return Err; // Static samplers still not implemented @@ -46,21 +79,28 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { if (Error Err = Writer.writeInteger(Header.Flags)) return Err; - ValuesToReplaceOffsetsWith.push_back(Writer.getOffset()); + OffsetMap.addRewriteValue("header"); + + for (size_t It = 0; It < Parameters.size(); It++) { + const auto &P = Parameters[It]; - for (const dxbc::RootParameter &P : Parameters) { if (Error Err = Writer.writeEnum(P.ParameterType)) return Err; + if (Error Err = Writer.writeEnum(P.ShaderVisibility)) return Err; - OffsetsToReplace.push_back(Writer.getOffset()); - if (Error Err = Writer.writeInteger(Dummy)) + std::string Key = ("parameters" + Twine(It)).str(); + if (Error Err = OffsetMap.addOffset(Key)) return Err; } - for (const dxbc::RootParameter &P : Parameters) { - ValuesToReplaceOffsetsWith.push_back(Writer.getOffset()); + for (size_t It = 0; It < Parameters.size(); It++) { + const auto &P = Parameters[It]; + + std::string Key = ("parameters" + Twine(It)).str(); + OffsetMap.addRewriteValue(Key); + switch (P.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { if (Error Err = Writer.writeInteger(P.Constants.ShaderRegister)) @@ -75,17 +115,8 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { } } - assert(ValuesToReplaceOffsetsWith.size() == 
OffsetsToReplace.size() && - "Offset missing value to replace with."); - - for (size_t It = 0; It < ValuesToReplaceOffsetsWith.size(); It++) { - uint32_t Position = OffsetsToReplace[It]; - uint32_t Value = ValuesToReplaceOffsetsWith[It]; - - Writer.setOffset(Position); - if (Error Err = Writer.writeInteger(Value)) - return Err; - } + if (Error Err = OffsetMap.rewrite()) + return Err; llvm::ArrayRef BufferRef(reinterpret_cast(Buffer.data()), Buffer.size()); From 68c7513d27ee9d5ada3851230b4dcd9611e88f02 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Mon, 24 Feb 2025 22:39:34 +0000 Subject: [PATCH 208/220] refactoring to remove use of map and string --- .../llvm/MC/DXContainerRootSignature.h | 18 +---- llvm/lib/MC/DXContainerRootSignature.cpp | 65 ++++++++----------- 2 files changed, 28 insertions(+), 55 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index ca958c0780cdd..911ff53c15fae 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/IndexedMap.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/raw_ostream.h" -#include -#include namespace llvm { @@ -18,21 +17,6 @@ class raw_ostream; namespace mcdxbc { -class StreamOffsetHelper { -private: - std::map> OffsetsMaping; - BinaryStreamWriter &Stream; - -public: - explicit StreamOffsetHelper(BinaryStreamWriter &Stream) : Stream(Stream) {} - - Error addOffset(std::string Key); - - void addRewriteValue(std::string Key); - - Error rewrite(); -}; - struct RootSignatureDesc { dxbc::RootSignatureHeader Header; SmallVector Parameters; diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 828785980c15d..abcaa66aff380 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,19 +7,19 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/BinaryStreamWriter.h" +#include +#include using namespace llvm; using namespace llvm::mcdxbc; -Error StreamOffsetHelper::addOffset(std::string Key) { +Error setRewrite(BinaryStreamWriter &Stream, uint32_t &Offset) { const uint32_t DummyValue = std::numeric_limits::max(); - uint32_t Offset = Stream.getOffset(); - auto Value = std::make_pair(Offset, DummyValue); - - OffsetsMaping.insert_or_assign(Key, Value); + Offset = Stream.getOffset(); if (Error Err = Stream.writeInteger(DummyValue)) return Err; @@ -27,26 +27,13 @@ Error StreamOffsetHelper::addOffset(std::string Key) { return Error::success(); } -void StreamOffsetHelper::addRewriteValue(std::string Key) { - auto It = OffsetsMaping.find(Key); - assert(It != OffsetsMaping.end() && "Offset address was not found."); - auto [Offset, _] = It->second; - - uint32_t Value = Stream.getOffset(); - - std::pair NewValue = std::make_pair(Offset, Value); - OffsetsMaping.insert_or_assign(Key, NewValue); -} - -Error StreamOffsetHelper::rewrite() { - for (auto &[Key, RewriteInfo] : OffsetsMaping) { - auto [Position, Value] = RewriteInfo; - assert(Value != std::numeric_limits::max()); +Error rewriteOffset(BinaryStreamWriter &Stream, uint32_t Offset) { + uint64_t Value = Stream.getOffset(); + 
Stream.setOffset(Offset); + if (Error Err = Stream.writeInteger((uint32_t)Value)) + return Err; - Stream.setOffset(Position); - if (Error Err = Stream.writeInteger(Value)) - return Err; - } + Stream.setOffset(Value); return Error::success(); } @@ -55,8 +42,6 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { std::vector Buffer(getSizeInBytes()); BinaryStreamWriter Writer(Buffer, llvm::endianness::little); - StreamOffsetHelper OffsetMap(Writer); - const uint32_t NumParameters = Parameters.size(); const uint32_t Zero = 0; @@ -66,7 +51,8 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { if (Error Err = Writer.writeInteger(NumParameters)) return Err; - if (Error Err = OffsetMap.addOffset("header")) + uint32_t HeaderPoint; + if (Error Err = setRewrite(Writer, HeaderPoint)) return Err; // Static samplers still not implemented @@ -79,10 +65,11 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { if (Error Err = Writer.writeInteger(Header.Flags)) return Err; - OffsetMap.addRewriteValue("header"); + if (Error Err = rewriteOffset(Writer, HeaderPoint)) + return Err; - for (size_t It = 0; It < Parameters.size(); It++) { - const auto &P = Parameters[It]; + SmallVector ParamsOffset; + for (const auto &P : Parameters) { if (Error Err = Writer.writeEnum(P.ParameterType)) return Err; @@ -90,16 +77,19 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { if (Error Err = Writer.writeEnum(P.ShaderVisibility)) return Err; - std::string Key = ("parameters" + Twine(It)).str(); - if (Error Err = OffsetMap.addOffset(Key)) + uint32_t Offset; + if (Error Err = setRewrite(Writer, Offset)) return Err; + ParamsOffset.push_back(Offset); } - for (size_t It = 0; It < Parameters.size(); It++) { - const auto &P = Parameters[It]; + size_t It = 0; + for (const auto &P : Parameters) { - std::string Key = ("parameters" + Twine(It)).str(); - OffsetMap.addRewriteValue(Key); + auto Offset = ParamsOffset[It]; + if (Error Err = rewriteOffset(Writer, Offset)) + return Err; + It++; switch (P.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { @@ -115,8 +105,7 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { } } - if (Error Err = OffsetMap.rewrite()) - return Err; + assert(It == NumParameters); llvm::ArrayRef BufferRef(reinterpret_cast(Buffer.data()), Buffer.size()); From 23069ab6def6089a74ce50f3d1b42c896b13f623 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 04:25:33 +0000 Subject: [PATCH 209/220] addressing comments --- llvm/lib/MC/DXContainerRootSignature.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index abcaa66aff380..5cdd7726e6e3f 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -83,13 +83,12 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { ParamsOffset.push_back(Offset); } - size_t It = 0; - for (const auto &P : Parameters) { - - auto Offset = ParamsOffset[It]; - if (Error Err = rewriteOffset(Writer, Offset)) + assert(NumParameters == ParamsOffset.size()); + for (size_t I = 0; I < NumParameters; ++I) { + if (Error Err = rewriteOffset(Writer, ParamsOffset[I])) return Err; - It++; + + const auto &P = Parameters[I]; switch (P.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { @@ -105,8 +104,6 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { } } - assert(It == NumParameters); - llvm::ArrayRef BufferRef(reinterpret_cast(Buffer.data()), Buffer.size()); 
OS.write(BufferRef.data(), BufferRef.size()); From d14471b11c7eaaa12870982229cc804946cc59fb Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 18:52:33 +0000 Subject: [PATCH 210/220] using buffer_ostream --- .../llvm/MC/DXContainerRootSignature.h | 2 - llvm/lib/MC/DXContainerRootSignature.cpp | 90 ++++++------------- 2 files changed, 29 insertions(+), 63 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 911ff53c15fae..9e8c7ee54c148 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -6,9 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/IndexedMap.h" #include "llvm/BinaryFormat/DXContainer.h" -#include "llvm/Support/BinaryStreamWriter.h" #include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 5cdd7726e6e3f..d66b9f7776a71 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -9,104 +9,72 @@ #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/Error.h" #include -#include using namespace llvm; using namespace llvm::mcdxbc; -Error setRewrite(BinaryStreamWriter &Stream, uint32_t &Offset) { +void setRewrite(raw_ostream &Stream, uint32_t &Offset) { const uint32_t DummyValue = std::numeric_limits::max(); - - Offset = Stream.getOffset(); - - if (Error Err = Stream.writeInteger(DummyValue)) - return Err; - - return Error::success(); + Offset = Stream.tell(); + support::endian::write(Stream, DummyValue, llvm::endianness::little); } -Error rewriteOffset(BinaryStreamWriter &Stream, uint32_t Offset) { - uint64_t Value = Stream.getOffset(); - Stream.setOffset(Offset); - if (Error Err = Stream.writeInteger((uint32_t)Value)) - return Err; - - Stream.setOffset(Value); - - return Error::success(); +void rewriteOffset(buffer_ostream &Stream, uint32_t Offset) { + uint32_t Value = Stream.tell(); + auto *InsertPoint = &Stream.buffer()[Offset]; + support::endian::write(InsertPoint, Value, llvm::endianness::little); } Error RootSignatureDesc::write(raw_ostream &OS) const { - std::vector Buffer(getSizeInBytes()); - BinaryStreamWriter Writer(Buffer, llvm::endianness::little); - + buffer_ostream Writer(OS); const uint32_t NumParameters = Parameters.size(); const uint32_t Zero = 0; - if (Error Err = Writer.writeInteger(Header.Version)) - return Err; - - if (Error Err = Writer.writeInteger(NumParameters)) - return Err; + support::endian::write(Writer, Header.Version, llvm::endianness::little); + support::endian::write(Writer, NumParameters, llvm::endianness::little); uint32_t HeaderPoint; - if (Error Err = setRewrite(Writer, HeaderPoint)) - return Err; - - // Static samplers still not implemented - if (Error Err = Writer.writeInteger(Zero)) - return Err; + setRewrite(Writer, HeaderPoint); - if (Error Err = Writer.writeInteger(Zero)) - return Err; + support::endian::write(Writer, Zero, llvm::endianness::little); + support::endian::write(Writer, Zero, llvm::endianness::little); + support::endian::write(Writer, Header.Flags, llvm::endianness::little); - if (Error Err = Writer.writeInteger(Header.Flags)) - return Err; - - if (Error Err = rewriteOffset(Writer, HeaderPoint)) - return Err; + 
rewriteOffset(Writer, HeaderPoint); SmallVector ParamsOffset; for (const auto &P : Parameters) { - - if (Error Err = Writer.writeEnum(P.ParameterType)) - return Err; - - if (Error Err = Writer.writeEnum(P.ShaderVisibility)) - return Err; + support::endian::write(Writer, P.ParameterType, llvm::endianness::little); + support::endian::write(Writer, P.ShaderVisibility, + llvm::endianness::little); uint32_t Offset; - if (Error Err = setRewrite(Writer, Offset)) - return Err; + setRewrite(Writer, Offset); + ParamsOffset.push_back(Offset); } assert(NumParameters == ParamsOffset.size()); for (size_t I = 0; I < NumParameters; ++I) { - if (Error Err = rewriteOffset(Writer, ParamsOffset[I])) - return Err; - + rewriteOffset(Writer, ParamsOffset[I]); const auto &P = Parameters[I]; switch (P.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { - if (Error Err = Writer.writeInteger(P.Constants.ShaderRegister)) - return Err; - if (Error Err = Writer.writeInteger(P.Constants.RegisterSpace)) - return Err; - if (Error Err = Writer.writeInteger(P.Constants.Num32BitValues)) - return Err; + support::endian::write(Writer, P.Constants.ShaderRegister, + llvm::endianness::little); + support::endian::write(Writer, P.Constants.RegisterSpace, + llvm::endianness::little); + support::endian::write(Writer, P.Constants.Num32BitValues, + llvm::endianness::little); } break; case dxbc::RootParameterType::Empty: llvm_unreachable("Invalid RootParameterType"); } } - llvm::ArrayRef BufferRef(reinterpret_cast(Buffer.data()), - Buffer.size()); - OS.write(BufferRef.data(), BufferRef.size()); - return Error::success(); } From 216341ca3457f9da59cecc6f400e75f88ce51e90 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 18:54:26 +0000 Subject: [PATCH 211/220] remove getsize --- llvm/include/llvm/MC/DXContainerRootSignature.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 9e8c7ee54c148..d2dfb77087212 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -20,11 +20,6 @@ struct RootSignatureDesc { SmallVector Parameters; Error write(raw_ostream &OS) const; - - uint32_t getSizeInBytes() const { - // Header Size + accounting for parameter offset + parameters size - return 24 + (Parameters.size() * 4) + Parameters.size_in_bytes(); - } }; } // namespace mcdxbc } // namespace llvm From 85f012c3e9f05f91386d4c4731cbe98fadeabe3c Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 19:16:56 +0000 Subject: [PATCH 212/220] clean up --- llvm/include/llvm/MC/DXContainerRootSignature.h | 4 +--- llvm/lib/MC/DXContainerRootSignature.cpp | 8 +------- llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 3 +-- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 5 +---- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index d2dfb77087212..89e8a6c6c1834 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -7,19 +7,17 @@ //===----------------------------------------------------------------------===// #include "llvm/BinaryFormat/DXContainer.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { class raw_ostream; namespace mcdxbc { - struct RootSignatureDesc { dxbc::RootSignatureHeader Header; SmallVector Parameters; - Error write(raw_ostream &OS) const; + void write(raw_ostream &OS) 
const; }; } // namespace mcdxbc } // namespace llvm diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index d66b9f7776a71..1a08d92c72cab 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -7,11 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/DXContainerRootSignature.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Twine.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/Error.h" -#include using namespace llvm; using namespace llvm::mcdxbc; @@ -28,7 +24,7 @@ void rewriteOffset(buffer_ostream &Stream, uint32_t Offset) { support::endian::write(InsertPoint, Value, llvm::endianness::little); } -Error RootSignatureDesc::write(raw_ostream &OS) const { +void RootSignatureDesc::write(raw_ostream &OS) const { buffer_ostream Writer(OS); const uint32_t NumParameters = Parameters.size(); const uint32_t Zero = 0; @@ -75,6 +71,4 @@ Error RootSignatureDesc::write(raw_ostream &OS) const { llvm_unreachable("Invalid RootParameterType"); } } - - return Error::success(); } diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index a5831a69f9bca..4f275fb58adc1 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -271,8 +271,7 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { RS.Header.Version = P.RootSignature->Version; RS.Parameters = std::move(P.RootSignature->Parameters); - if (Error Err = RS.write(OS)) - handleAllErrors(std::move(Err)); + RS.write(OS); break; } diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 5801046f83674..5508af40663b1 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -25,12 +25,10 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/DXContainerPSVInfo.h" #include "llvm/Pass.h" -#include "llvm/Support/Error.h" #include "llvm/Support/MD5.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include -#include using namespace llvm; using namespace llvm::dxil; @@ -175,8 +173,7 @@ void DXContainerGlobals::addRootSignature(Module &M, SmallString<256> Data; raw_svector_ostream OS(Data); - if (Error Err = RS.write(OS)) - handleAllErrors(std::move(Err)); + RS.write(OS); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); From 1e2bcf54b775f792fdf61818698a15932601b7cf Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 19:17:51 +0000 Subject: [PATCH 213/220] clean up --- llvm/include/llvm/MC/DXContainerRootSignature.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 89e8a6c6c1834..b31b0da352038 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/BinaryFormat/DXContainer.h" +#include +#include namespace llvm { From 0e277d96f18808648a299195f436fc0216c4e155 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 19:19:51 +0000 Subject: [PATCH 214/220] clean up --- llvm/lib/Object/DXContainer.cpp | 10 ++++------ llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) 
diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 35261b661cf2f..8d955d01ce202 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -296,18 +296,16 @@ Error DirectX::RootSignature::parse(StringRef Data) { NewParam.ParameterType = support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidParameterType( - NewParam.ParameterType)) + if (!dxbc::RootSignatureValidations::isValidParameterType(NewParam.ParameterType)) return validationFailed("unsupported parameter type value read: " + llvm::Twine((uint32_t)NewParam.ParameterType)); Current += sizeof(dxbc::RootParameterType); NewParam.ShaderVisibility = - support::endian::read( - Current); - if (!dxbc::RootSignatureValidations::isValidShaderVisibility( - NewParam.ShaderVisibility)) + support::endian::read(Current); + if (!dxbc::RootSignatureValidations::isValidShaderVisibility(NewParam.ShaderVisibility)) return validationFailed("unsupported shader visility flag value read: " + llvm::Twine((uint32_t)NewParam.ShaderVisibility)); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 4f275fb58adc1..87ba16fd40ba9 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -272,7 +272,6 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { RS.Parameters = std::move(P.RootSignature->Parameters); RS.write(OS); - break; } uint64_t BytesWritten = OS.tell() - DataStart; From 5cd0044cf3ae363b9adb129029b2d1d728e111ce Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 19:20:35 +0000 Subject: [PATCH 215/220] clean up --- llvm/lib/Object/DXContainer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 8d955d01ce202..010f70a952ebf 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -303,7 +303,7 @@ Error DirectX::RootSignature::parse(StringRef Data) { Current += sizeof(dxbc::RootParameterType); NewParam.ShaderVisibility = - support::endian::read(Current); if (!dxbc::RootSignatureValidations::isValidShaderVisibility(NewParam.ShaderVisibility)) return validationFailed("unsupported shader visility flag value read: " + From 7a7c34d5976359975d44aed6495b34bfb0beb244 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 22:35:38 +0000 Subject: [PATCH 216/220] addressing pr comments --- llvm/lib/MC/DXContainerRootSignature.cpp | 54 ++++++++++++------------ llvm/lib/Object/DXContainer.cpp | 10 +++-- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 1a08d92c72cab..d90ee510fddf3 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -8,63 +8,61 @@ #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Support/EndianStream.h" +#include using namespace llvm; using namespace llvm::mcdxbc; -void setRewrite(raw_ostream &Stream, uint32_t &Offset) { +static uint32_t writePlaceholder(raw_ostream &Stream) { const uint32_t DummyValue = std::numeric_limits::max(); - Offset = Stream.tell(); + uint32_t Offset = Stream.tell(); support::endian::write(Stream, DummyValue, llvm::endianness::little); + return Offset; } -void rewriteOffset(buffer_ostream &Stream, uint32_t Offset) { - uint32_t Value = Stream.tell(); - auto *InsertPoint = &Stream.buffer()[Offset]; - support::endian::write(InsertPoint, 
Value, llvm::endianness::little); +static void rewriteOffset(buffer_ostream &Stream, uint32_t Offset) { + uint32_t Value = + support::endian::byte_swap( + Stream.tell()); + Stream.pwrite(reinterpret_cast(&Value), sizeof(Value), Offset); } void RootSignatureDesc::write(raw_ostream &OS) const { - buffer_ostream Writer(OS); + buffer_ostream BOS(OS); const uint32_t NumParameters = Parameters.size(); const uint32_t Zero = 0; - support::endian::write(Writer, Header.Version, llvm::endianness::little); - support::endian::write(Writer, NumParameters, llvm::endianness::little); + support::endian::write(BOS, Header.Version, llvm::endianness::little); + support::endian::write(BOS, NumParameters, llvm::endianness::little); - uint32_t HeaderPoint; - setRewrite(Writer, HeaderPoint); + uint32_t HeaderPoint = writePlaceholder(BOS); - support::endian::write(Writer, Zero, llvm::endianness::little); - support::endian::write(Writer, Zero, llvm::endianness::little); - support::endian::write(Writer, Header.Flags, llvm::endianness::little); + support::endian::write(BOS, Zero, llvm::endianness::little); + support::endian::write(BOS, Zero, llvm::endianness::little); + support::endian::write(BOS, Header.Flags, llvm::endianness::little); - rewriteOffset(Writer, HeaderPoint); + rewriteOffset(BOS, HeaderPoint); - SmallVector ParamsOffset; + SmallVector ParamsOffsets; for (const auto &P : Parameters) { - support::endian::write(Writer, P.ParameterType, llvm::endianness::little); - support::endian::write(Writer, P.ShaderVisibility, - llvm::endianness::little); + support::endian::write(BOS, P.ParameterType, llvm::endianness::little); + support::endian::write(BOS, P.ShaderVisibility, llvm::endianness::little); - uint32_t Offset; - setRewrite(Writer, Offset); - - ParamsOffset.push_back(Offset); + ParamsOffsets.push_back(writePlaceholder(BOS)); } - assert(NumParameters == ParamsOffset.size()); + assert(NumParameters == ParamsOffsets.size()); for (size_t I = 0; I < NumParameters; ++I) { - rewriteOffset(Writer, ParamsOffset[I]); + rewriteOffset(BOS, ParamsOffsets[I]); const auto &P = Parameters[I]; switch (P.ParameterType) { case dxbc::RootParameterType::Constants32Bit: { - support::endian::write(Writer, P.Constants.ShaderRegister, + support::endian::write(BOS, P.Constants.ShaderRegister, llvm::endianness::little); - support::endian::write(Writer, P.Constants.RegisterSpace, + support::endian::write(BOS, P.Constants.RegisterSpace, llvm::endianness::little); - support::endian::write(Writer, P.Constants.Num32BitValues, + support::endian::write(BOS, P.Constants.Num32BitValues, llvm::endianness::little); } break; case dxbc::RootParameterType::Empty: diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 010f70a952ebf..35261b661cf2f 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -296,16 +296,18 @@ Error DirectX::RootSignature::parse(StringRef Data) { NewParam.ParameterType = support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidParameterType(NewParam.ParameterType)) + if (!dxbc::RootSignatureValidations::isValidParameterType( + NewParam.ParameterType)) return validationFailed("unsupported parameter type value read: " + llvm::Twine((uint32_t)NewParam.ParameterType)); Current += sizeof(dxbc::RootParameterType); NewParam.ShaderVisibility = - support::endian::read(Current); - if (!dxbc::RootSignatureValidations::isValidShaderVisibility(NewParam.ShaderVisibility)) + support::endian::read( + Current); + if 
(!dxbc::RootSignatureValidations::isValidShaderVisibility( + NewParam.ShaderVisibility)) return validationFailed("unsupported shader visility flag value read: " + llvm::Twine((uint32_t)NewParam.ShaderVisibility)); From d3fafab79d4964ea366af87cbb4c644594f09f93 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 25 Feb 2025 22:40:05 +0000 Subject: [PATCH 217/220] clean up --- llvm/lib/MC/DXContainerRootSignature.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index d90ee510fddf3..1db8f55a31658 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -8,7 +8,6 @@ #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Support/EndianStream.h" -#include using namespace llvm; using namespace llvm::mcdxbc; From 74856401627958970d033598e49b3eef59f40cc0 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 26 Feb 2025 00:55:19 +0000 Subject: [PATCH 218/220] clean up --- llvm/include/llvm/BinaryFormat/DXContainer.h | 10 +++++----- llvm/unittests/Object/DXContainerTest.cpp | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 5db665f77160c..af65587630105 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -561,9 +561,9 @@ static_assert(sizeof(ProgramSignatureElement) == 32, "ProgramSignatureElement is misaligned"); struct RootConstants { - uint32_t ShaderRegister = 0; - uint32_t RegisterSpace = 0; - uint32_t Num32BitValues = 0; + uint32_t ShaderRegister; + uint32_t RegisterSpace; + uint32_t Num32BitValues; void swapBytes() { sys::swapByteOrder(ShaderRegister); @@ -599,8 +599,8 @@ struct RootParameter { }; struct RootSignatureHeader { - uint32_t Version = 2; - uint32_t Flags = 0; + uint32_t Version; + uint32_t Flags; void swapBytes() { sys::swapByteOrder(Version); diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index d4981293b029d..8199397854384 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -837,12 +837,12 @@ TEST(RootSignature, ParseRootFlags) { const auto &RS = C.getRootSignature(); ASSERT_TRUE(RS.has_value()); - ASSERT_EQ(RS->getVersion(), 2); - ASSERT_EQ(RS->getNumParameters(), 0); - ASSERT_EQ(RS->getRootParametersOffset(), 0); - ASSERT_EQ(RS->getNumStaticSamplers(), 0); - ASSERT_EQ(RS->getStaticSamplersOffset(), 0); - ASSERT_EQ(RS->getFlags(), 0x01); + ASSERT_EQ(RS->getVersion(), 2u); + ASSERT_EQ(RS->getNumParameters(), 0u); + ASSERT_EQ(RS->getRootParametersOffset(), 0u); + ASSERT_EQ(RS->getNumStaticSamplers(), 0u); + ASSERT_EQ(RS->getStaticSamplersOffset(), 0u); + ASSERT_EQ(RS->getFlags(), 0x01u); } { From 17abc8297e956c97585e5e96738f7221e4a33b36 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 26 Feb 2025 01:08:59 +0000 Subject: [PATCH 219/220] moving initializer arround --- llvm/include/llvm/MC/DXContainerRootSignature.h | 2 ++ llvm/include/llvm/Object/DXContainer.h | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index b31b0da352038..02da4eeae3df2 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -16,8 +16,10 @@ class raw_ostream; namespace mcdxbc { struct 
RootSignatureDesc { + dxbc::RootSignatureHeader Header; SmallVector Parameters; + RootSignatureDesc() { Header = dxbc::RootSignatureHeader{2, 0}; } void write(raw_ostream &OS) const; }; diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index ddcc025c15460..96aa19141bc4b 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -119,12 +119,12 @@ namespace DirectX { class RootSignature { private: - uint32_t Version; - uint32_t NumParameters; - uint32_t RootParametersOffset; - uint32_t NumStaticSamplers; - uint32_t StaticSamplersOffset; - uint32_t Flags; + uint32_t Version = 2; + uint32_t NumParameters = 0; + uint32_t RootParametersOffset = 0; + uint32_t NumStaticSamplers = 0; + uint32_t StaticSamplersOffset = 0; + uint32_t Flags = 0; SmallVector Parameters; From 4b177e262e8ed4496c08006905b40771f4b83cf3 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 26 Feb 2025 19:44:56 +0000 Subject: [PATCH 220/220] addressing pr comments --- llvm/include/llvm/BinaryFormat/DXContainer.h | 7 +------ llvm/include/llvm/MC/DXContainerRootSignature.h | 2 +- llvm/include/llvm/Object/DXContainer.h | 4 +++- llvm/lib/Object/DXContainer.cpp | 4 ++-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index af65587630105..2f6b248055826 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -579,13 +579,7 @@ struct RootParameter { }; dxbc::ShaderVisibility ShaderVisibility; - RootParameter() { - ParameterType = dxbc::RootParameterType::Empty; - ShaderVisibility = dxbc::ShaderVisibility::Empty; - } - void swapBytes() { - sys::swapByteOrder(ParameterType); sys::swapByteOrder(ShaderVisibility); switch (ParameterType) { case RootParameterType::Constants32Bit: @@ -595,6 +589,7 @@ struct RootParameter { llvm_unreachable("invalid value for ParameterType"); break; } + sys::swapByteOrder(ParameterType); } }; diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 02da4eeae3df2..ffd1c034768de 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -19,7 +19,7 @@ struct RootSignatureDesc { dxbc::RootSignatureHeader Header; SmallVector Parameters; - RootSignatureDesc() { Header = dxbc::RootSignatureHeader{2, 0}; } + RootSignatureDesc() : Header(dxbc::RootSignatureHeader{2, 0}) {} void write(raw_ostream &OS) const; }; diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 96aa19141bc4b..631744bd4fd94 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -137,7 +137,9 @@ class RootSignature { uint32_t getRootParametersOffset() const { return RootParametersOffset; } uint32_t getNumStaticSamplers() const { return NumStaticSamplers; } uint32_t getStaticSamplersOffset() const { return StaticSamplersOffset; } - SmallVector getParameters() const { return Parameters; } + const SmallVector &getParameters() const { + return Parameters; + } uint32_t getFlags() const { return Flags; } }; diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 35261b661cf2f..1a0463e8ac850 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -319,10 +319,10 @@ Error DirectX::RootSignature::parse(StringRef Data) { switch (NewParam.ParameterType) 
{ - case dxbc::RootParameterType::Constants32Bit: { + case dxbc::RootParameterType::Constants32Bit: if (Error Err = readStruct(Data, Begin + Offset, NewParam.Constants)) return Err; - } break; + break; case dxbc::RootParameterType::Empty: // unreachable because it was validated and assigned before this point. llvm_unreachable("Invalid value for RootParameterType");
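
Editor's note: the serialization logic that this series converges on (patch 216) writes each offset field as a dummy value first, records where it was emitted, and patches the real byte offset in later via `buffer_ostream::pwrite` once the referenced payload exists. The sketch below illustrates that same write-placeholder-then-backpatch pattern as a standalone program; it deliberately uses only the C++ standard library instead of LLVM's stream classes, and the record layout and field names are invented for illustration, not the DXContainer root signature format.

```cpp
// Standalone illustration of the placeholder/backpatch pattern used by
// RootSignatureDesc::write. Not LLVM code; layout below is hypothetical.
#include <cstdint>
#include <cstdio>
#include <vector>

// Append a 32-bit value in little-endian order and return its offset.
static uint32_t writeU32(std::vector<uint8_t> &Buf, uint32_t V) {
  uint32_t Offset = static_cast<uint32_t>(Buf.size());
  for (int I = 0; I < 4; ++I)
    Buf.push_back(static_cast<uint8_t>((V >> (8 * I)) & 0xFF));
  return Offset;
}

// Reserve space for an offset that is not known yet; remember where it lives.
static uint32_t writePlaceholder(std::vector<uint8_t> &Buf) {
  return writeU32(Buf, 0xFFFFFFFFu);
}

// Patch a previously written placeholder with the current end-of-buffer
// position, mirroring what rewriteOffset does with buffer_ostream::pwrite.
static void backpatch(std::vector<uint8_t> &Buf, uint32_t Placeholder) {
  uint32_t Value = static_cast<uint32_t>(Buf.size());
  for (int I = 0; I < 4; ++I)
    Buf[Placeholder + I] = static_cast<uint8_t>((Value >> (8 * I)) & 0xFF);
}

int main() {
  std::vector<uint8_t> Buf;
  writeU32(Buf, 2);                               // version
  writeU32(Buf, 1);                               // number of records
  uint32_t RecordsOffset = writePlaceholder(Buf); // offset to record payload
  writeU32(Buf, 0);                               // flags
  backpatch(Buf, RecordsOffset);                  // payload starts here
  writeU32(Buf, 42);                              // the single record's body
  std::printf("blob size: %zu bytes\n", Buf.size());
  return 0;
}
```

The design choice this sketch mirrors is why the later patches drop the `BinaryStreamWriter`/pre-sized-buffer approach: streaming to a pwrite-capable buffer lets offsets be fixed up after the fact without computing the total size up front.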