From 3a77522387ccff149c3a59eaf86ca4b574061134 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 17 Feb 2024 10:19:27 +0000 Subject: [PATCH] [AArch64][GlobalISel] Improve and expand fcopysign lowering (#71283) This alters the lowering of G_FCOPYSIGN to support vector types. The general idea is that we just lower it to vector operations using and/or and a mask, which are now converted to a BIF/BIT/BSP. In the process the existing AArch64LegalizerInfo::legalizeFCopySign can be removed, relying on expanding the scalar versions to vector instead, which just needs a small adjustment to allow widening scalars to vectors. --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 1 + .../CodeGen/GlobalISel/MachineIRBuilder.cpp | 15 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 73 +- .../AArch64/GISel/AArch64LegalizerInfo.h | 1 - .../AArch64/GlobalISel/legalize-fcopysign.mir | 40 +- .../GlobalISel/legalizer-info-validation.mir | 4 +- llvm/test/CodeGen/AArch64/fcopysign.ll | 739 ++++++++++-------- 7 files changed, 454 insertions(+), 419 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 3870e66ff2662..e5b229fcd54f5 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5210,6 +5210,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: + case TargetOpcode::G_FCOPYSIGN: case TargetOpcode::G_UADDSAT: case TargetOpcode::G_USUBSAT: case TargetOpcode::G_SADDSAT: diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index d58b62871817d..8feb708b78351 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -269,14 +269,19 @@ MachineIRBuilder::buildDeleteTrailingVectorElements(const DstOp &Res, LLT ResTy = 
Res.getLLTTy(*getMRI()); LLT Op0Ty = Op0.getLLTTy(*getMRI()); - assert((ResTy.isVector() && Op0Ty.isVector()) && "Non vector type"); - assert((ResTy.getElementType() == Op0Ty.getElementType()) && + assert(Op0Ty.isVector() && "Non vector type"); + assert(((ResTy.isScalar() && (ResTy == Op0Ty.getElementType())) || + (ResTy.isVector() && + (ResTy.getElementType() == Op0Ty.getElementType()))) && "Different vector element types"); - assert((ResTy.getNumElements() < Op0Ty.getNumElements()) && - "Op0 has fewer elements"); + assert( + (ResTy.isScalar() || (ResTy.getNumElements() < Op0Ty.getNumElements())) && + "Op0 has fewer elements"); - SmallVector Regs; auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0); + if (ResTy.isScalar()) + return buildCopy(Res, Unmerge.getReg(0)); + SmallVector Regs; for (unsigned i = 0; i < ResTy.getNumElements(); ++i) Regs.push_back(Unmerge.getReg(i)); return buildMergeLikeInstr(Res, Regs); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 933f13dd5a19a..4a3f710163e72 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1157,10 +1157,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) .legalFor({{s64, s32}, {s64, s64}}); - // TODO: Custom legalization for vector types. // TODO: Custom legalization for mismatched types. - // TODO: s16 support. - getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}}); + getActionDefinitionsBuilder(G_FCOPYSIGN) + .moreElementsIf( + [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); }, + [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[0]; + return std::pair(0, LLT::fixed_vector(Ty == s16 ? 
4 : 2, Ty)); + }) + .lower(); getActionDefinitionsBuilder(G_FMAD).lower(); @@ -1217,8 +1222,6 @@ bool AArch64LegalizerInfo::legalizeCustom( case TargetOpcode::G_MEMMOVE: case TargetOpcode::G_MEMSET: return legalizeMemOps(MI, Helper); - case TargetOpcode::G_FCOPYSIGN: - return legalizeFCopySign(MI, Helper); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return legalizeExtractVectorElt(MI, MRI, Helper); case TargetOpcode::G_DYN_STACKALLOC: @@ -1960,66 +1963,6 @@ bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI, return false; } -bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI, - LegalizerHelper &Helper) const { - MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); - Register Dst = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(Dst); - assert(DstTy.isScalar() && "Only expected scalars right now!"); - const unsigned DstSize = DstTy.getSizeInBits(); - assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!"); - assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy && - "Expected homogeneous types!"); - - // We want to materialize a mask with the high bit set. - uint64_t EltMask; - LLT VecTy; - - // TODO: s16 support. - switch (DstSize) { - default: - llvm_unreachable("Unexpected type for G_FCOPYSIGN!"); - case 64: { - // AdvSIMD immediate moves cannot materialize out mask in a single - // instruction for 64-bit elements. Instead, materialize zero and then - // negate it. - EltMask = 0; - VecTy = LLT::fixed_vector(2, DstTy); - break; - } - case 32: - EltMask = 0x80000000ULL; - VecTy = LLT::fixed_vector(4, DstTy); - break; - } - - // Widen In1 and In2 to 128 bits. We want these to eventually become - // INSERT_SUBREGs. 
- auto Undef = MIRBuilder.buildUndef(VecTy); - auto Zero = MIRBuilder.buildConstant(DstTy, 0); - auto Ins1 = MIRBuilder.buildInsertVectorElement( - VecTy, Undef, MI.getOperand(1).getReg(), Zero); - auto Ins2 = MIRBuilder.buildInsertVectorElement( - VecTy, Undef, MI.getOperand(2).getReg(), Zero); - - // Construct the mask. - auto Mask = MIRBuilder.buildConstant(VecTy, EltMask); - if (DstSize == 64) - Mask = MIRBuilder.buildFNeg(VecTy, Mask); - - auto Sel = MIRBuilder.buildInstr(AArch64::G_BSP, {VecTy}, {Mask, Ins2, Ins1}); - - // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We - // want this to eventually become an EXTRACT_SUBREG. - SmallVector DstRegs(1, Dst); - for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I) - DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy)); - MIRBuilder.buildUnmerge(DstRegs, Sel); - MI.eraseFromParent(); - return true; -} - bool AArch64LegalizerInfo::legalizeExtractVectorElt( MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const { assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index c62a9d847c52f..b69d9b015bd2b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -60,7 +60,6 @@ class AArch64LegalizerInfo : public LegalizerInfo { LegalizerHelper &Helper) const; bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const; bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const; - bool legalizeFCopySign(MachineInstr &MI, LegalizerHelper &Helper) const; bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const; bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir 
b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir index 86824127132da..dd794b7af9466 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir @@ -13,14 +13,18 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %val:_(s32) = COPY $s0 ; CHECK-NEXT: %sign:_(s32) = COPY $s1 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %val(s32), [[C]](s32) - ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %sign(s32), [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32) - ; CHECK-NEXT: [[BSP:%[0-9]+]]:_(<4 x s32>) = G_BSP [[BUILD_VECTOR]], [[IVEC1]], [[IVEC]] - ; CHECK-NEXT: %fcopysign:_(s32), %10:_(s32), %11:_(s32), %12:_(s32) = G_UNMERGE_VALUES [[BSP]](<4 x s32>) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR %val(s32), [[DEF]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR %sign(s32), [[DEF]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s32>) = G_AND [[BUILD_VECTOR]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s32>) = G_AND [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s32>) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[OR]](<2 x s32>) + ; 
CHECK-NEXT: %fcopysign:_(s32) = COPY [[UV]](s32) ; CHECK-NEXT: $s0 = COPY %fcopysign(s32) ; CHECK-NEXT: RET_ReallyLR implicit $s0 %val:_(s32) = COPY $s0 @@ -41,14 +45,18 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %val:_(s64) = COPY $d0 ; CHECK-NEXT: %sign:_(s64) = COPY $d1 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], %val(s64), [[C]](s64) - ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], %sign(s64), [[C]](s64) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s64>) = G_FNEG [[BUILD_VECTOR]] - ; CHECK-NEXT: [[BSP:%[0-9]+]]:_(<2 x s64>) = G_BSP [[FNEG]], [[IVEC1]], [[IVEC]] - ; CHECK-NEXT: %fcopysign:_(s64), %10:_(s64) = G_UNMERGE_VALUES [[BSP]](<2 x s64>) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR %val(s64), [[DEF]](s64) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR %sign(s64), [[DEF]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[C1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s64>) = G_AND [[BUILD_VECTOR]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s64>) = G_AND [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s64>) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[OR]](<2 x s64>) + ; CHECK-NEXT: %fcopysign:_(s64) = COPY [[UV]](s64) ; CHECK-NEXT: $d0 = COPY %fcopysign(s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 
%val:_(s64) = COPY $d0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index aaf2fef954590..381897b1835de 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -526,8 +526,8 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FCOPYSIGN (opcode {{[0-9]+}}): 2 type indices -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_IS_FPCLASS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll index a1c48bd943e2f..89e78f7147490 100644 --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -1,29 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define double @copysign_f64(double %a, double %b) { -; CHECK-SD-LABEL: copysign_f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: movi v2.2d, #0xffffffffffffffff -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-SD-NEXT: fneg v2.2d, v2.2d -; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: copysign_f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov v0.d[0], v0.d[0] -; CHECK-GI-NEXT: mov v1.d[0], v1.d[0] -; CHECK-GI-NEXT: fneg v2.2d, v2.2d -; CHECK-GI-NEXT: bit v0.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: copysign_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fneg v2.2d, v2.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $q0 +; CHECK-NEXT: ret entry: %c = call double @llvm.copysign.f64(double %a, double %b) ret double %c @@ -41,13 +29,11 @@ define float @copysign_f32(float %a, float %b) { ; ; CHECK-GI-LABEL: copysign_f32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-GI-NEXT: movi v2.4s, #128, lsl #24 -; CHECK-GI-NEXT: mov v0.s[0], v0.s[0] -; CHECK-GI-NEXT: mov v1.s[0], v1.s[0] -; CHECK-GI-NEXT: bit v0.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: mvni v2.2s, #128, lsl #24 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $d0 +; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $d1 +; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 ; CHECK-GI-NEXT: ret entry: %c = call float @llvm.copysign.f32(float %a, float %b) @@ -55,14 +41,23 @@ entry: } define half @copysign_f16(half %a, half %b) { -; CHECK-LABEL: copysign_f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: copysign_f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: copysign_f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mvni v2.4h, #128, lsl #8 +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; CHECK-GI-NEXT: ret entry: %c = call half @llvm.copysign.f16(half %a, half %b) ret half %c @@ -81,25 +76,46 @@ entry: } define <3 x double> @copysign_v3f64(<3 x double> %a, <3 x 
double> %b) { -; CHECK-LABEL: copysign_v3f64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v6.2d, #0xffffffffffffffff -; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $d4 killed $d4 def $q4 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: // kill: def $d5 killed $d5 def $q5 -; CHECK-NEXT: mov v3.d[1], v4.d[0] -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: fneg v1.2d, v6.2d -; CHECK-NEXT: bif v0.16b, v3.16b, v1.16b -; CHECK-NEXT: bif v2.16b, v5.16b, v1.16b -; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2 -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: copysign_v3f64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v6.2d, #0xffffffffffffffff +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: // kill: def $d5 killed $d5 def $q5 +; CHECK-SD-NEXT: mov v3.d[1], v4.d[0] +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: fneg v1.2d, v6.2d +; CHECK-SD-NEXT: bif v0.16b, v3.16b, v1.16b +; CHECK-SD-NEXT: bif v2.16b, v5.16b, v1.16b +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: copysign_v3f64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v6.2d, #0xffffffffffffffff +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d3 
killed $d3 def $q3 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff +; CHECK-GI-NEXT: and x9, x9, #0x8000000000000000 +; CHECK-GI-NEXT: fneg v1.2d, v6.2d +; CHECK-GI-NEXT: orr x8, x8, x9 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: bif v0.16b, v3.16b, v1.16b +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = call <3 x double> @llvm.copysign.v3f64(<3 x double> %a, <3 x double> %b) ret <3 x double> %c @@ -130,11 +146,28 @@ entry: } define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) { -; CHECK-LABEL: copysign_v3f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: copysign_v3f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: copysign_v3f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mov v3.s[1], w8 +; CHECK-GI-NEXT: mov v2.s[2], w9 +; CHECK-GI-NEXT: mov v3.s[2], w8 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mov v3.s[3], w8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret entry: %c = call <3 x float> @llvm.copysign.v3f32(<3 x float> %a, <3 x float> %b) ret <3 x float> %c @@ -164,293 +197,339 @@ entry: } define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) { -; CHECK-LABEL: copysign_v7f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov h2, 
v1.h[1] -; CHECK-NEXT: mov h4, v0.h[1] -; CHECK-NEXT: fcvt s5, h1 -; CHECK-NEXT: fcvt s6, h0 -; CHECK-NEXT: mvni v3.4s, #128, lsl #24 -; CHECK-NEXT: mov h7, v1.h[2] -; CHECK-NEXT: mov h16, v0.h[2] -; CHECK-NEXT: mov h17, v1.h[3] -; CHECK-NEXT: fcvt s2, h2 -; CHECK-NEXT: fcvt s4, h4 -; CHECK-NEXT: bit v5.16b, v6.16b, v3.16b -; CHECK-NEXT: mov h6, v0.h[3] -; CHECK-NEXT: fcvt s7, h7 -; CHECK-NEXT: fcvt s16, h16 -; CHECK-NEXT: fcvt s17, h17 -; CHECK-NEXT: bif v4.16b, v2.16b, v3.16b -; CHECK-NEXT: fcvt h2, s5 -; CHECK-NEXT: mov v5.16b, v3.16b -; CHECK-NEXT: fcvt s6, h6 -; CHECK-NEXT: bsl v5.16b, v16.16b, v7.16b -; CHECK-NEXT: fcvt h4, s4 -; CHECK-NEXT: mov h7, v1.h[4] -; CHECK-NEXT: mov h16, v0.h[4] -; CHECK-NEXT: bif v6.16b, v17.16b, v3.16b -; CHECK-NEXT: mov h17, v0.h[5] -; CHECK-NEXT: fcvt h5, s5 -; CHECK-NEXT: mov v2.h[1], v4.h[0] -; CHECK-NEXT: fcvt s4, h7 -; CHECK-NEXT: fcvt s7, h16 -; CHECK-NEXT: mov h16, v1.h[5] -; CHECK-NEXT: fcvt h6, s6 -; CHECK-NEXT: fcvt s17, h17 -; CHECK-NEXT: mov v2.h[2], v5.h[0] -; CHECK-NEXT: mov h5, v1.h[6] -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: bit v4.16b, v7.16b, v3.16b -; CHECK-NEXT: mov h7, v0.h[6] -; CHECK-NEXT: fcvt s16, h16 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: mov v2.h[3], v6.h[0] -; CHECK-NEXT: fcvt s5, h5 -; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvt s6, h7 -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: fcvt h4, s4 -; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bsl v7.16b, v17.16b, v16.16b -; CHECK-NEXT: bit v5.16b, v6.16b, v3.16b -; CHECK-NEXT: mov v2.h[4], v4.h[0] -; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b -; CHECK-NEXT: fcvt h4, s7 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: mov v2.h[5], v4.h[0] -; CHECK-NEXT: fcvt h4, s5 -; CHECK-NEXT: mov v2.h[6], v4.h[0] -; CHECK-NEXT: mov v2.h[7], v0.h[0] -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: copysign_v7f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov h2, v1.h[1] +; CHECK-SD-NEXT: mov h4, v0.h[1] +; CHECK-SD-NEXT: fcvt s5, h1 +; 
CHECK-SD-NEXT: fcvt s6, h0 +; CHECK-SD-NEXT: mvni v3.4s, #128, lsl #24 +; CHECK-SD-NEXT: mov h7, v1.h[2] +; CHECK-SD-NEXT: mov h16, v0.h[2] +; CHECK-SD-NEXT: mov h17, v1.h[3] +; CHECK-SD-NEXT: fcvt s2, h2 +; CHECK-SD-NEXT: fcvt s4, h4 +; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b +; CHECK-SD-NEXT: mov h6, v0.h[3] +; CHECK-SD-NEXT: fcvt s7, h7 +; CHECK-SD-NEXT: fcvt s16, h16 +; CHECK-SD-NEXT: fcvt s17, h17 +; CHECK-SD-NEXT: bif v4.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: fcvt h2, s5 +; CHECK-SD-NEXT: mov v5.16b, v3.16b +; CHECK-SD-NEXT: fcvt s6, h6 +; CHECK-SD-NEXT: bsl v5.16b, v16.16b, v7.16b +; CHECK-SD-NEXT: fcvt h4, s4 +; CHECK-SD-NEXT: mov h7, v1.h[4] +; CHECK-SD-NEXT: mov h16, v0.h[4] +; CHECK-SD-NEXT: bif v6.16b, v17.16b, v3.16b +; CHECK-SD-NEXT: mov h17, v0.h[5] +; CHECK-SD-NEXT: fcvt h5, s5 +; CHECK-SD-NEXT: mov v2.h[1], v4.h[0] +; CHECK-SD-NEXT: fcvt s4, h7 +; CHECK-SD-NEXT: fcvt s7, h16 +; CHECK-SD-NEXT: mov h16, v1.h[5] +; CHECK-SD-NEXT: fcvt h6, s6 +; CHECK-SD-NEXT: fcvt s17, h17 +; CHECK-SD-NEXT: mov v2.h[2], v5.h[0] +; CHECK-SD-NEXT: mov h5, v1.h[6] +; CHECK-SD-NEXT: mov h1, v1.h[7] +; CHECK-SD-NEXT: bit v4.16b, v7.16b, v3.16b +; CHECK-SD-NEXT: mov h7, v0.h[6] +; CHECK-SD-NEXT: fcvt s16, h16 +; CHECK-SD-NEXT: mov h0, v0.h[7] +; CHECK-SD-NEXT: mov v2.h[3], v6.h[0] +; CHECK-SD-NEXT: fcvt s5, h5 +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: fcvt s6, h7 +; CHECK-SD-NEXT: mov v7.16b, v3.16b +; CHECK-SD-NEXT: fcvt h4, s4 +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bsl v7.16b, v17.16b, v16.16b +; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b +; CHECK-SD-NEXT: mov v2.h[4], v4.h[0] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: fcvt h4, s7 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov v2.h[5], v4.h[0] +; CHECK-SD-NEXT: fcvt h4, s5 +; CHECK-SD-NEXT: mov v2.h[6], v4.h[0] +; CHECK-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: copysign_v7f16: +; CHECK-GI: // %bb.0: // %entry +; 
CHECK-GI-NEXT: mov w8, #32768 // =0x8000 +; CHECK-GI-NEXT: mov w9, #32767 // =0x7fff +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v4.16b, v2.16b +; CHECK-GI-NEXT: mov v5.16b, v3.16b +; CHECK-GI-NEXT: mov v4.h[1], v2.h[0] +; CHECK-GI-NEXT: mov v5.h[1], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v5.h[2], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[3], v2.h[0] +; CHECK-GI-NEXT: mov v5.h[3], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[4], v2.h[0] +; CHECK-GI-NEXT: mov v5.h[4], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[5], v2.h[0] +; CHECK-GI-NEXT: mov v5.h[5], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[6], v2.h[0] +; CHECK-GI-NEXT: mov v5.h[6], v3.h[0] +; CHECK-GI-NEXT: mov v4.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v5.h[7], v0.h[0] +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v5.16b +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret entry: %c = call <7 x half> @llvm.copysign.v7f16(<7 x half> %a, <7 x half> %b) ret <7 x half> %c } define <4 x half> @copysign_v4f16(<4 x half> %a, <4 x half> %b) { -; CHECK-LABEL: copysign_v4f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h3, v1.h[1] -; CHECK-NEXT: mov h4, v0.h[1] -; CHECK-NEXT: mov h5, v1.h[2] -; CHECK-NEXT: mov h6, v0.h[2] -; CHECK-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-NEXT: fcvt s7, h1 -; CHECK-NEXT: fcvt s16, h0 -; CHECK-NEXT: mov h1, v1.h[3] -; CHECK-NEXT: fcvt s3, h3 -; CHECK-NEXT: fcvt s4, h4 -; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: bit v3.16b, v4.16b, v2.16b -; CHECK-NEXT: fcvt s4, h5 -; CHECK-NEXT: fcvt s5, h6 -; CHECK-NEXT: mov v6.16b, v2.16b -; CHECK-NEXT: bsl v6.16b, v16.16b, v7.16b -; CHECK-NEXT: mov h7, v0.h[3] -; CHECK-NEXT: bit v4.16b, v5.16b, v2.16b -; CHECK-NEXT: fcvt h3, s3 -; CHECK-NEXT: fcvt h0, s6 -; CHECK-NEXT: fcvt s5, h7 -; CHECK-NEXT: mov v0.h[1], v3.h[0] -; CHECK-NEXT: fcvt h3, s4 -; CHECK-NEXT: bit v1.16b, 
v5.16b, v2.16b -; CHECK-NEXT: mov v0.h[2], v3.h[0] -; CHECK-NEXT: fcvt h1, s1 -; CHECK-NEXT: mov v0.h[3], v1.h[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: copysign_v4f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov h3, v1.h[1] +; CHECK-SD-NEXT: mov h4, v0.h[1] +; CHECK-SD-NEXT: mov h5, v1.h[2] +; CHECK-SD-NEXT: mov h6, v0.h[2] +; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-SD-NEXT: fcvt s7, h1 +; CHECK-SD-NEXT: fcvt s16, h0 +; CHECK-SD-NEXT: mov h1, v1.h[3] +; CHECK-SD-NEXT: fcvt s3, h3 +; CHECK-SD-NEXT: fcvt s4, h4 +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: bit v3.16b, v4.16b, v2.16b +; CHECK-SD-NEXT: fcvt s4, h5 +; CHECK-SD-NEXT: fcvt s5, h6 +; CHECK-SD-NEXT: mov v6.16b, v2.16b +; CHECK-SD-NEXT: bsl v6.16b, v16.16b, v7.16b +; CHECK-SD-NEXT: mov h7, v0.h[3] +; CHECK-SD-NEXT: bit v4.16b, v5.16b, v2.16b +; CHECK-SD-NEXT: fcvt h3, s3 +; CHECK-SD-NEXT: fcvt h0, s6 +; CHECK-SD-NEXT: fcvt s5, h7 +; CHECK-SD-NEXT: mov v0.h[1], v3.h[0] +; CHECK-SD-NEXT: fcvt h3, s4 +; CHECK-SD-NEXT: bit v1.16b, v5.16b, v2.16b +; CHECK-SD-NEXT: mov v0.h[2], v3.h[0] +; CHECK-SD-NEXT: fcvt h1, s1 +; CHECK-SD-NEXT: mov v0.h[3], v1.h[0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: copysign_v4f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mvni v2.4h, #128, lsl #8 +; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ret entry: %c = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %c } define <8 x half> @copysign_v8f16(<8 x half> %a, <8 x half> %b) { -; CHECK-LABEL: copysign_v8f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h4, v0.h[1] -; CHECK-NEXT: fcvt s5, h1 -; CHECK-NEXT: fcvt s6, h0 -; CHECK-NEXT: mvni v3.4s, #128, lsl #24 -; CHECK-NEXT: mov h7, v1.h[2] -; CHECK-NEXT: mov 
h16, v0.h[2] -; CHECK-NEXT: mov h17, v1.h[3] -; CHECK-NEXT: fcvt s2, h2 -; CHECK-NEXT: fcvt s4, h4 -; CHECK-NEXT: bit v5.16b, v6.16b, v3.16b -; CHECK-NEXT: mov h6, v0.h[3] -; CHECK-NEXT: fcvt s7, h7 -; CHECK-NEXT: fcvt s16, h16 -; CHECK-NEXT: fcvt s17, h17 -; CHECK-NEXT: bif v4.16b, v2.16b, v3.16b -; CHECK-NEXT: fcvt h2, s5 -; CHECK-NEXT: mov v5.16b, v3.16b -; CHECK-NEXT: fcvt s6, h6 -; CHECK-NEXT: bsl v5.16b, v16.16b, v7.16b -; CHECK-NEXT: fcvt h4, s4 -; CHECK-NEXT: mov h7, v1.h[4] -; CHECK-NEXT: mov h16, v0.h[4] -; CHECK-NEXT: bif v6.16b, v17.16b, v3.16b -; CHECK-NEXT: mov h17, v0.h[5] -; CHECK-NEXT: fcvt h5, s5 -; CHECK-NEXT: mov v2.h[1], v4.h[0] -; CHECK-NEXT: fcvt s4, h7 -; CHECK-NEXT: fcvt s7, h16 -; CHECK-NEXT: mov h16, v1.h[5] -; CHECK-NEXT: fcvt h6, s6 -; CHECK-NEXT: fcvt s17, h17 -; CHECK-NEXT: mov v2.h[2], v5.h[0] -; CHECK-NEXT: mov h5, v1.h[6] -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: bit v4.16b, v7.16b, v3.16b -; CHECK-NEXT: mov h7, v0.h[6] -; CHECK-NEXT: fcvt s16, h16 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: mov v2.h[3], v6.h[0] -; CHECK-NEXT: fcvt s5, h5 -; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvt s6, h7 -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: fcvt h4, s4 -; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bsl v7.16b, v17.16b, v16.16b -; CHECK-NEXT: bit v5.16b, v6.16b, v3.16b -; CHECK-NEXT: mov v2.h[4], v4.h[0] -; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b -; CHECK-NEXT: fcvt h4, s7 -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: mov v2.h[5], v4.h[0] -; CHECK-NEXT: fcvt h4, s5 -; CHECK-NEXT: mov v2.h[6], v4.h[0] -; CHECK-NEXT: mov v2.h[7], v0.h[0] -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: copysign_v8f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov h2, v1.h[1] +; CHECK-SD-NEXT: mov h4, v0.h[1] +; CHECK-SD-NEXT: fcvt s5, h1 +; CHECK-SD-NEXT: fcvt s6, h0 +; CHECK-SD-NEXT: mvni v3.4s, #128, lsl #24 +; CHECK-SD-NEXT: mov h7, v1.h[2] +; CHECK-SD-NEXT: mov h16, v0.h[2] +; CHECK-SD-NEXT: mov h17, v1.h[3] +; 
CHECK-SD-NEXT: fcvt s2, h2 +; CHECK-SD-NEXT: fcvt s4, h4 +; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b +; CHECK-SD-NEXT: mov h6, v0.h[3] +; CHECK-SD-NEXT: fcvt s7, h7 +; CHECK-SD-NEXT: fcvt s16, h16 +; CHECK-SD-NEXT: fcvt s17, h17 +; CHECK-SD-NEXT: bif v4.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: fcvt h2, s5 +; CHECK-SD-NEXT: mov v5.16b, v3.16b +; CHECK-SD-NEXT: fcvt s6, h6 +; CHECK-SD-NEXT: bsl v5.16b, v16.16b, v7.16b +; CHECK-SD-NEXT: fcvt h4, s4 +; CHECK-SD-NEXT: mov h7, v1.h[4] +; CHECK-SD-NEXT: mov h16, v0.h[4] +; CHECK-SD-NEXT: bif v6.16b, v17.16b, v3.16b +; CHECK-SD-NEXT: mov h17, v0.h[5] +; CHECK-SD-NEXT: fcvt h5, s5 +; CHECK-SD-NEXT: mov v2.h[1], v4.h[0] +; CHECK-SD-NEXT: fcvt s4, h7 +; CHECK-SD-NEXT: fcvt s7, h16 +; CHECK-SD-NEXT: mov h16, v1.h[5] +; CHECK-SD-NEXT: fcvt h6, s6 +; CHECK-SD-NEXT: fcvt s17, h17 +; CHECK-SD-NEXT: mov v2.h[2], v5.h[0] +; CHECK-SD-NEXT: mov h5, v1.h[6] +; CHECK-SD-NEXT: mov h1, v1.h[7] +; CHECK-SD-NEXT: bit v4.16b, v7.16b, v3.16b +; CHECK-SD-NEXT: mov h7, v0.h[6] +; CHECK-SD-NEXT: fcvt s16, h16 +; CHECK-SD-NEXT: mov h0, v0.h[7] +; CHECK-SD-NEXT: mov v2.h[3], v6.h[0] +; CHECK-SD-NEXT: fcvt s5, h5 +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: fcvt s6, h7 +; CHECK-SD-NEXT: mov v7.16b, v3.16b +; CHECK-SD-NEXT: fcvt h4, s4 +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bsl v7.16b, v17.16b, v16.16b +; CHECK-SD-NEXT: bit v5.16b, v6.16b, v3.16b +; CHECK-SD-NEXT: mov v2.h[4], v4.h[0] +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: fcvt h4, s7 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov v2.h[5], v4.h[0] +; CHECK-SD-NEXT: fcvt h4, s5 +; CHECK-SD-NEXT: mov v2.h[6], v4.h[0] +; CHECK-SD-NEXT: mov v2.h[7], v0.h[0] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: copysign_v8f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mvni v2.8h, #128, lsl #8 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %c = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x 
half> %b) ret <8 x half> %c } define <16 x half> @copysign_v16f16(<16 x half> %a, <16 x half> %b) { -; CHECK-LABEL: copysign_v16f16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov h4, v2.h[1] -; CHECK-NEXT: mov h5, v0.h[1] -; CHECK-NEXT: mvni v6.4s, #128, lsl #24 -; CHECK-NEXT: mov h7, v3.h[1] -; CHECK-NEXT: mov h16, v1.h[1] -; CHECK-NEXT: fcvt s17, h2 -; CHECK-NEXT: fcvt s18, h0 -; CHECK-NEXT: mov h19, v2.h[2] -; CHECK-NEXT: mov h20, v0.h[2] -; CHECK-NEXT: fcvt s21, h3 -; CHECK-NEXT: fcvt s22, h1 -; CHECK-NEXT: mov h23, v3.h[2] -; CHECK-NEXT: fcvt s4, h4 -; CHECK-NEXT: fcvt s5, h5 -; CHECK-NEXT: mov h24, v1.h[2] -; CHECK-NEXT: fcvt s7, h7 -; CHECK-NEXT: fcvt s16, h16 -; CHECK-NEXT: mov h25, v1.h[3] -; CHECK-NEXT: mov h26, v1.h[6] -; CHECK-NEXT: bit v21.16b, v22.16b, v6.16b -; CHECK-NEXT: fcvt s22, h23 -; CHECK-NEXT: bit v4.16b, v5.16b, v6.16b -; CHECK-NEXT: mov v5.16b, v6.16b -; CHECK-NEXT: fcvt s23, h24 -; CHECK-NEXT: bit v7.16b, v16.16b, v6.16b -; CHECK-NEXT: mov h24, v3.h[3] -; CHECK-NEXT: bsl v5.16b, v18.16b, v17.16b -; CHECK-NEXT: fcvt s18, h19 -; CHECK-NEXT: fcvt s19, h20 -; CHECK-NEXT: mov h20, v0.h[3] -; CHECK-NEXT: mov h17, v2.h[3] -; CHECK-NEXT: fcvt h16, s4 -; CHECK-NEXT: fcvt h7, s7 -; CHECK-NEXT: fcvt h4, s5 -; CHECK-NEXT: bit v18.16b, v19.16b, v6.16b -; CHECK-NEXT: fcvt h5, s21 -; CHECK-NEXT: fcvt s19, h20 -; CHECK-NEXT: mov v20.16b, v6.16b -; CHECK-NEXT: fcvt s17, h17 -; CHECK-NEXT: fcvt s21, h25 -; CHECK-NEXT: mov h25, v0.h[6] -; CHECK-NEXT: bsl v20.16b, v23.16b, v22.16b -; CHECK-NEXT: mov v4.h[1], v16.h[0] -; CHECK-NEXT: fcvt s16, h24 -; CHECK-NEXT: fcvt h18, s18 -; CHECK-NEXT: mov h22, v2.h[4] -; CHECK-NEXT: mov h23, v0.h[4] -; CHECK-NEXT: bit v17.16b, v19.16b, v6.16b -; CHECK-NEXT: mov h19, v3.h[4] -; CHECK-NEXT: mov h24, v1.h[4] -; CHECK-NEXT: mov v5.h[1], v7.h[0] -; CHECK-NEXT: fcvt h7, s20 -; CHECK-NEXT: bit v16.16b, v21.16b, v6.16b -; CHECK-NEXT: mov v4.h[2], v18.h[0] -; CHECK-NEXT: fcvt s18, h22 -; CHECK-NEXT: fcvt s20, h23 -; CHECK-NEXT: 
fcvt h17, s17 -; CHECK-NEXT: fcvt s19, h19 -; CHECK-NEXT: fcvt s21, h24 -; CHECK-NEXT: mov h22, v2.h[5] -; CHECK-NEXT: mov h23, v0.h[5] -; CHECK-NEXT: mov h24, v1.h[5] -; CHECK-NEXT: mov v5.h[2], v7.h[0] -; CHECK-NEXT: fcvt h7, s16 -; CHECK-NEXT: mov h16, v3.h[5] -; CHECK-NEXT: bit v18.16b, v20.16b, v6.16b -; CHECK-NEXT: mov h20, v2.h[6] -; CHECK-NEXT: mov h2, v2.h[7] -; CHECK-NEXT: bit v19.16b, v21.16b, v6.16b -; CHECK-NEXT: mov h21, v3.h[6] -; CHECK-NEXT: mov v4.h[3], v17.h[0] -; CHECK-NEXT: fcvt s17, h22 -; CHECK-NEXT: fcvt s22, h23 -; CHECK-NEXT: fcvt s23, h25 -; CHECK-NEXT: mov v5.h[3], v7.h[0] -; CHECK-NEXT: fcvt s7, h16 -; CHECK-NEXT: fcvt s16, h24 -; CHECK-NEXT: fcvt h18, s18 -; CHECK-NEXT: fcvt s20, h20 -; CHECK-NEXT: fcvt s24, h26 -; CHECK-NEXT: fcvt h19, s19 -; CHECK-NEXT: fcvt s21, h21 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: bit v17.16b, v22.16b, v6.16b -; CHECK-NEXT: mov h3, v3.h[7] -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: bit v7.16b, v16.16b, v6.16b -; CHECK-NEXT: mov v16.16b, v6.16b -; CHECK-NEXT: fcvt s2, h2 -; CHECK-NEXT: mov v4.h[4], v18.h[0] -; CHECK-NEXT: mov v18.16b, v6.16b -; CHECK-NEXT: mov v5.h[4], v19.h[0] -; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bsl v16.16b, v23.16b, v20.16b -; CHECK-NEXT: fcvt h17, s17 -; CHECK-NEXT: fcvt s3, h3 -; CHECK-NEXT: bsl v18.16b, v24.16b, v21.16b -; CHECK-NEXT: fcvt h7, s7 -; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: bif v0.16b, v2.16b, v6.16b -; CHECK-NEXT: mov v4.h[5], v17.h[0] -; CHECK-NEXT: fcvt h2, s16 -; CHECK-NEXT: mov v5.h[5], v7.h[0] -; CHECK-NEXT: fcvt h7, s18 -; CHECK-NEXT: bif v1.16b, v3.16b, v6.16b -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: mov v4.h[6], v2.h[0] -; CHECK-NEXT: mov v5.h[6], v7.h[0] -; CHECK-NEXT: fcvt h1, s1 -; CHECK-NEXT: mov v4.h[7], v0.h[0] -; CHECK-NEXT: mov v5.h[7], v1.h[0] -; CHECK-NEXT: mov v0.16b, v4.16b -; CHECK-NEXT: mov v1.16b, v5.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: copysign_v16f16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov h4, v2.h[1] +; 
CHECK-SD-NEXT: mov h5, v0.h[1] +; CHECK-SD-NEXT: mvni v6.4s, #128, lsl #24 +; CHECK-SD-NEXT: mov h7, v3.h[1] +; CHECK-SD-NEXT: mov h16, v1.h[1] +; CHECK-SD-NEXT: fcvt s17, h2 +; CHECK-SD-NEXT: fcvt s18, h0 +; CHECK-SD-NEXT: mov h19, v2.h[2] +; CHECK-SD-NEXT: mov h20, v0.h[2] +; CHECK-SD-NEXT: fcvt s21, h3 +; CHECK-SD-NEXT: fcvt s22, h1 +; CHECK-SD-NEXT: mov h23, v3.h[2] +; CHECK-SD-NEXT: fcvt s4, h4 +; CHECK-SD-NEXT: fcvt s5, h5 +; CHECK-SD-NEXT: mov h24, v1.h[2] +; CHECK-SD-NEXT: fcvt s7, h7 +; CHECK-SD-NEXT: fcvt s16, h16 +; CHECK-SD-NEXT: mov h25, v1.h[3] +; CHECK-SD-NEXT: mov h26, v1.h[6] +; CHECK-SD-NEXT: bit v21.16b, v22.16b, v6.16b +; CHECK-SD-NEXT: fcvt s22, h23 +; CHECK-SD-NEXT: bit v4.16b, v5.16b, v6.16b +; CHECK-SD-NEXT: mov v5.16b, v6.16b +; CHECK-SD-NEXT: fcvt s23, h24 +; CHECK-SD-NEXT: bit v7.16b, v16.16b, v6.16b +; CHECK-SD-NEXT: mov h24, v3.h[3] +; CHECK-SD-NEXT: bsl v5.16b, v18.16b, v17.16b +; CHECK-SD-NEXT: fcvt s18, h19 +; CHECK-SD-NEXT: fcvt s19, h20 +; CHECK-SD-NEXT: mov h20, v0.h[3] +; CHECK-SD-NEXT: mov h17, v2.h[3] +; CHECK-SD-NEXT: fcvt h16, s4 +; CHECK-SD-NEXT: fcvt h7, s7 +; CHECK-SD-NEXT: fcvt h4, s5 +; CHECK-SD-NEXT: bit v18.16b, v19.16b, v6.16b +; CHECK-SD-NEXT: fcvt h5, s21 +; CHECK-SD-NEXT: fcvt s19, h20 +; CHECK-SD-NEXT: mov v20.16b, v6.16b +; CHECK-SD-NEXT: fcvt s17, h17 +; CHECK-SD-NEXT: fcvt s21, h25 +; CHECK-SD-NEXT: mov h25, v0.h[6] +; CHECK-SD-NEXT: bsl v20.16b, v23.16b, v22.16b +; CHECK-SD-NEXT: mov v4.h[1], v16.h[0] +; CHECK-SD-NEXT: fcvt s16, h24 +; CHECK-SD-NEXT: fcvt h18, s18 +; CHECK-SD-NEXT: mov h22, v2.h[4] +; CHECK-SD-NEXT: mov h23, v0.h[4] +; CHECK-SD-NEXT: bit v17.16b, v19.16b, v6.16b +; CHECK-SD-NEXT: mov h19, v3.h[4] +; CHECK-SD-NEXT: mov h24, v1.h[4] +; CHECK-SD-NEXT: mov v5.h[1], v7.h[0] +; CHECK-SD-NEXT: fcvt h7, s20 +; CHECK-SD-NEXT: bit v16.16b, v21.16b, v6.16b +; CHECK-SD-NEXT: mov v4.h[2], v18.h[0] +; CHECK-SD-NEXT: fcvt s18, h22 +; CHECK-SD-NEXT: fcvt s20, h23 +; CHECK-SD-NEXT: fcvt h17, s17 +; 
CHECK-SD-NEXT: fcvt s19, h19 +; CHECK-SD-NEXT: fcvt s21, h24 +; CHECK-SD-NEXT: mov h22, v2.h[5] +; CHECK-SD-NEXT: mov h23, v0.h[5] +; CHECK-SD-NEXT: mov h24, v1.h[5] +; CHECK-SD-NEXT: mov v5.h[2], v7.h[0] +; CHECK-SD-NEXT: fcvt h7, s16 +; CHECK-SD-NEXT: mov h16, v3.h[5] +; CHECK-SD-NEXT: bit v18.16b, v20.16b, v6.16b +; CHECK-SD-NEXT: mov h20, v2.h[6] +; CHECK-SD-NEXT: mov h2, v2.h[7] +; CHECK-SD-NEXT: bit v19.16b, v21.16b, v6.16b +; CHECK-SD-NEXT: mov h21, v3.h[6] +; CHECK-SD-NEXT: mov v4.h[3], v17.h[0] +; CHECK-SD-NEXT: fcvt s17, h22 +; CHECK-SD-NEXT: fcvt s22, h23 +; CHECK-SD-NEXT: fcvt s23, h25 +; CHECK-SD-NEXT: mov v5.h[3], v7.h[0] +; CHECK-SD-NEXT: fcvt s7, h16 +; CHECK-SD-NEXT: fcvt s16, h24 +; CHECK-SD-NEXT: fcvt h18, s18 +; CHECK-SD-NEXT: fcvt s20, h20 +; CHECK-SD-NEXT: fcvt s24, h26 +; CHECK-SD-NEXT: fcvt h19, s19 +; CHECK-SD-NEXT: fcvt s21, h21 +; CHECK-SD-NEXT: mov h0, v0.h[7] +; CHECK-SD-NEXT: bit v17.16b, v22.16b, v6.16b +; CHECK-SD-NEXT: mov h3, v3.h[7] +; CHECK-SD-NEXT: mov h1, v1.h[7] +; CHECK-SD-NEXT: bit v7.16b, v16.16b, v6.16b +; CHECK-SD-NEXT: mov v16.16b, v6.16b +; CHECK-SD-NEXT: fcvt s2, h2 +; CHECK-SD-NEXT: mov v4.h[4], v18.h[0] +; CHECK-SD-NEXT: mov v18.16b, v6.16b +; CHECK-SD-NEXT: mov v5.h[4], v19.h[0] +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: bsl v16.16b, v23.16b, v20.16b +; CHECK-SD-NEXT: fcvt h17, s17 +; CHECK-SD-NEXT: fcvt s3, h3 +; CHECK-SD-NEXT: bsl v18.16b, v24.16b, v21.16b +; CHECK-SD-NEXT: fcvt h7, s7 +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: bif v0.16b, v2.16b, v6.16b +; CHECK-SD-NEXT: mov v4.h[5], v17.h[0] +; CHECK-SD-NEXT: fcvt h2, s16 +; CHECK-SD-NEXT: mov v5.h[5], v7.h[0] +; CHECK-SD-NEXT: fcvt h7, s18 +; CHECK-SD-NEXT: bif v1.16b, v3.16b, v6.16b +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: mov v4.h[6], v2.h[0] +; CHECK-SD-NEXT: mov v5.h[6], v7.h[0] +; CHECK-SD-NEXT: fcvt h1, s1 +; CHECK-SD-NEXT: mov v4.h[7], v0.h[0] +; CHECK-SD-NEXT: mov v5.h[7], v1.h[0] +; CHECK-SD-NEXT: mov v0.16b, v4.16b +; CHECK-SD-NEXT: 
mov v1.16b, v5.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: copysign_v16f16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mvni v4.8h, #128, lsl #8 +; CHECK-GI-NEXT: bif v0.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-GI-NEXT: ret entry: %c = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b) ret <16 x half> %c