Skip to content

[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabled #143100

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 9, 2025

Conversation

phoebewang
Copy link
Contributor

@llvmbot
Copy link
Member

llvmbot commented Jun 6, 2025

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Fixes: https://godbolt.org/z/7jYa3bWK9


Patch is 76.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143100.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+8-1)
  • (modified) llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll (+435-40)
  • (modified) llvm/test/CodeGen/X86/avx512fp16-fminnum.ll (+435-40)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e929dab429de5..1555b8a669ae8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55357,10 +55357,17 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
+  auto IsMinMaxLegal = [&](EVT VT) {
+    if (!TLI.isTypeLegal(VT))
+      return false;
+    return VT.getScalarType() != MVT::f16 ||
+           (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
+  };
+
   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
         (Subtarget.hasSSE2() && VT == MVT::f64) ||
         (Subtarget.hasFP16() && VT == MVT::f16) ||
-        (VT.isVector() && TLI.isTypeLegal(VT))))
+        (VT.isVector() && IsMinMaxLegal(VT))))
     return SDValue();
 
   SDValue Op0 = N->getOperand(0);
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
index 1d535f93bc867..eac803f83e863 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl    | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl    | FileCheck %s --check-prefixes=CHECK,HasVL
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16    | FileCheck %s --check-prefixes=CHECK,NOVL
 
 declare half @llvm.maxnum.f16(half, half)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
@@ -9,61 +10,397 @@ declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
 declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
 
 define half @test_intrinsic_fmaxh(half %x, half %y) {
-; CHECK-LABEL: test_intrinsic_fmaxh:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmaxh:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; HasVL-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmaxh:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; NOVL-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call half @llvm.maxnum.f16(half %x, half %y) readnone
   ret half %z
 }
 
 define <2 x half> @test_intrinsic_fmax_v2f16(<2 x half> %x, <2 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v2f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v2f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v2f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT:    # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT:    # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT:    # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT:    vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT:    # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT:    vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT:    # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT:    vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT:    # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT:    # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT:    # xmm3 = xmm0[1,0]
+; NOVL-NEXT:    vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT:    # xmm4 = xmm1[1,0]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT:    # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT:    # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe0]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT:    vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT:    # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT:    vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> %y) readnone
   ret <2 x half> %z
 }
 
 define <4 x half> @test_intrinsic_fmax_v4f16(<4 x half> %x, <4 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v4f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v4f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v4f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT:    # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT:    # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT:    # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT:    vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT:    # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT:    vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT:    # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT:    vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT:    # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT:    # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT:    # xmm3 = xmm0[1,0]
+; NOVL-NEXT:    vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT:    # xmm4 = xmm1[1,0]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT:    # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT:    # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe0]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT:    vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT:    # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT:    vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %x, <4 x half> %y) readnone
   ret <4 x half> %z
 }
 
 define <8 x half> @test_intrinsic_fmax_v8f16(<8 x half> %x, <8 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v8f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v8f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v8f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT:    # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT:    # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT:    # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT:    vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT:    # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT:    vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT:    # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT:    vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT:    # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT:    # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT:    # xmm3 = xmm0[1,0]
+; NOVL-NEXT:    vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT:    # xmm4 = xmm1[1,0]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm...
[truncated]

return false;
return VT.getScalarType() != MVT::f16 ||
(Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just widen (with zeros) to v32f16?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a blocking issue: the generic combiner always folds extract_subvector(insert_subvector(BinOP X, Y)) back to BinOP X, Y, which undoes the widening. I created #143298 to demonstrate the problem.

On the other hand, the AVX512FP16-without-AVX512VL configuration doesn't occur in any real hardware. We just need to make sure there is no crash here; performance is not a concern.

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lgtm

@phoebewang phoebewang merged commit 4fbf67f into llvm:main Jun 9, 2025
9 checks passed
@phoebewang phoebewang deleted the minmax branch June 9, 2025 00:35
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants