Skip to content

Commit 0751418

Browse files
authored
[X86] Extend combinei64TruncSrlAdd to handle patterns with or and xor (#128435)
As discussed in #126448, the fold implemented by #126448 / #128353 can be extended to operations other than `add`. This patch extends the fold performed by `combinei64TruncSrlAdd` to include `or` and `xor` (proof: https://alive2.llvm.org/ce/z/AXuaQu). There's no need to extend it to `sub` and `and`, as similar folds are already being performed for those operations. CC: @phoebewang @RKSimon
1 parent 8f4d2e0 commit 0751418

File tree

2 files changed

+136
-30
lines changed

2 files changed

+136
-30
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+37-28
Original file line number · Diff line number · Diff line change
@@ -53788,36 +53788,35 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
5378853788
return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
5378953789
}
5379053790

53791-
// Attempt to fold some (truncate (srl (add X, C1), C2)) patterns to
53792-
// (add (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we are able
53793-
// to avoid generating code with MOVABS and large constants in certain cases.
53794-
static SDValue combinei64TruncSrlAdd(SDValue N, EVT VT, SelectionDAG &DAG,
53795-
const SDLoc &DL) {
53796-
using namespace llvm::SDPatternMatch;
53791+
// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
53792+
// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
53793+
// are able to avoid generating code with MOVABS and large constants in certain
53794+
// cases.
53795+
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
53796+
const SDLoc &DL) {
5379753797

53798-
SDValue AddLhs;
53799-
APInt AddConst, SrlConst;
53800-
if (VT != MVT::i32 ||
53801-
!sd_match(N, m_AllOf(m_SpecificVT(MVT::i64),
53802-
m_Srl(m_OneUse(m_Add(m_Value(AddLhs),
53803-
m_ConstInt(AddConst))),
53804-
m_ConstInt(SrlConst)))))
53805-
return SDValue();
53798+
SDValue Op = N.getOperand(0);
53799+
APInt OpConst = Op.getConstantOperandAPInt(1);
53800+
APInt SrlConst = N.getConstantOperandAPInt(1);
53801+
uint64_t SrlConstVal = SrlConst.getZExtValue();
53802+
unsigned Opcode = Op.getOpcode();
5380653803

53807-
if (SrlConst.ule(32) || AddConst.countr_zero() < SrlConst.getZExtValue())
53804+
if (SrlConst.ule(32) ||
53805+
(Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
5380853806
return SDValue();
5380953807

53810-
SDValue AddLHSSrl =
53811-
DAG.getNode(ISD::SRL, DL, MVT::i64, AddLhs, N.getOperand(1));
53812-
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, AddLHSSrl);
53808+
SDValue OpLhsSrl =
53809+
DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
53810+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
5381353811

53814-
APInt NewAddConstVal = AddConst.lshr(SrlConst).trunc(VT.getSizeInBits());
53815-
SDValue NewAddConst = DAG.getConstant(NewAddConstVal, DL, VT);
53816-
SDValue NewAddNode = DAG.getNode(ISD::ADD, DL, VT, Trunc, NewAddConst);
53812+
APInt NewOpConstVal = OpConst.lshr(SrlConst).trunc(VT.getSizeInBits());
53813+
SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
53814+
SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
53815+
EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
5381753816

53818-
EVT CleanUpVT =
53819-
EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConst.getZExtValue());
53820-
return DAG.getZeroExtendInReg(NewAddNode, DL, CleanUpVT);
53817+
if (Opcode == ISD::ADD)
53818+
return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
53819+
return NewOpNode;
5382153820
}
5382253821

5382353822
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
@@ -53865,11 +53864,21 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
5386553864
if (!Src.hasOneUse())
5386653865
return SDValue();
5386753866

53868-
if (SDValue R = combinei64TruncSrlAdd(Src, VT, DAG, DL))
53869-
return R;
53867+
if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL &&
53868+
isa<ConstantSDNode>(Src.getOperand(1))) {
53869+
53870+
unsigned SrcOpOpcode = Src.getOperand(0).getOpcode();
53871+
if ((SrcOpOpcode != ISD::ADD && SrcOpOpcode != ISD::OR &&
53872+
SrcOpOpcode != ISD::XOR) ||
53873+
!isa<ConstantSDNode>(Src.getOperand(0).getOperand(1)))
53874+
return SDValue();
53875+
53876+
if (SDValue R = combinei64TruncSrlConstant(Src, VT, DAG, DL))
53877+
return R;
53878+
53879+
return SDValue();
53880+
}
5387053881

53871-
// Only support vector truncation for now.
53872-
// TODO: i64 scalar math would benefit as well.
5387353882
if (!VT.isVector())
5387453883
return SDValue();
5387553884

llvm/test/CodeGen/X86/combine-i64-trunc-srl-add.ll

+99-2
Original file line number · Diff line number · Diff line change
@@ -128,6 +128,103 @@ define i32 @test_trunc_add(i64 %x) {
128128
ret i32 %conv
129129
}
130130

131+
define i32 @test_trunc_sub(i64 %x) {
132+
; X64-LABEL: test_trunc_sub:
133+
; X64: # %bb.0:
134+
; X64-NEXT: shrq $49, %rdi
135+
; X64-NEXT: leal 32762(%rdi), %eax
136+
; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
137+
; X64-NEXT: retq
138+
%sub = sub i64 %x, 3377699720527872
139+
%shr = lshr i64 %sub, 49
140+
%conv = trunc i64 %shr to i32
141+
ret i32 %conv
142+
}
143+
144+
define i32 @test_trunc_and_1(i64 %x) {
145+
; X64-LABEL: test_trunc_and_1:
146+
; X64: # %bb.0:
147+
; X64-NEXT: movq %rdi, %rax
148+
; X64-NEXT: shrq $50, %rax
149+
; X64-NEXT: andl $3, %eax
150+
; X64-NEXT: # kill: def $eax killed $eax killed $rax
151+
; X64-NEXT: retq
152+
%and = and i64 %x, 3940649673949184
153+
%shr = lshr i64 %and, 50
154+
%conv = trunc i64 %shr to i32
155+
ret i32 %conv
156+
}
157+
158+
define i32 @test_trunc_or_1(i64 %x) {
159+
; X64-LABEL: test_trunc_or_1:
160+
; X64: # %bb.0:
161+
; X64-NEXT: movq %rdi, %rax
162+
; X64-NEXT: shrq $50, %rax
163+
; X64-NEXT: orl $3, %eax
164+
; X64-NEXT: # kill: def $eax killed $eax killed $rax
165+
; X64-NEXT: retq
166+
%or = or i64 %x, 3940649673949184
167+
%shr = lshr i64 %or, 50
168+
%conv = trunc i64 %shr to i32
169+
ret i32 %conv
170+
}
171+
172+
define i32 @test_trunc_xor_1(i64 %x) {
173+
; X64-LABEL: test_trunc_xor_1:
174+
; X64: # %bb.0:
175+
; X64-NEXT: movq %rdi, %rax
176+
; X64-NEXT: shrq $50, %rax
177+
; X64-NEXT: xorl $3, %eax
178+
; X64-NEXT: # kill: def $eax killed $eax killed $rax
179+
; X64-NEXT: retq
180+
%xor = xor i64 %x, 3940649673949184
181+
%shr = lshr i64 %xor, 50
182+
%conv = trunc i64 %shr to i32
183+
ret i32 %conv
184+
}
185+
186+
define i32 @test_trunc_and_2(i64 %x) {
187+
; X64-LABEL: test_trunc_and_2:
188+
; X64: # %bb.0:
189+
; X64-NEXT: movq %rdi, %rax
190+
; X64-NEXT: shrq $45, %rax
191+
; X64-NEXT: andl $111, %eax
192+
; X64-NEXT: # kill: def $eax killed $eax killed $rax
193+
; X64-NEXT: retq
194+
%and = and i64 %x, 3940649673949183
195+
%shr = lshr i64 %and, 45
196+
%conv = trunc i64 %shr to i32
197+
ret i32 %conv
198+
}
199+
200+
define i32 @test_trunc_or_2(i64 %x) {
201+
; X64-LABEL: test_trunc_or_2:
202+
; X64: # %bb.0:
203+
; X64-NEXT: movq %rdi, %rax
204+
; X64-NEXT: shrq $45, %rax
205+
; X64-NEXT: orl $111, %eax
206+
; X64-NEXT: # kill: def $eax killed $eax killed $rax
207+
; X64-NEXT: retq
208+
%or = or i64 %x, 3940649673949183
209+
%shr = lshr i64 %or, 45
210+
%conv = trunc i64 %shr to i32
211+
ret i32 %conv
212+
}
213+
214+
define i32 @test_trunc_xor_2(i64 %x) {
215+
; X64-LABEL: test_trunc_xor_2:
216+
; X64: # %bb.0:
217+
; X64-NEXT: movq %rdi, %rax
218+
; X64-NEXT: shrq $45, %rax
219+
; X64-NEXT: xorl $111, %eax
220+
; X64-NEXT: # kill: def $eax killed $eax killed $rax
221+
; X64-NEXT: retq
222+
%xor = xor i64 %x, 3940649673949183
223+
%shr = lshr i64 %xor, 45
224+
%conv = trunc i64 %shr to i32
225+
ret i32 %conv
226+
}
227+
131228
; Make sure we don't crash on this test case.
132229

133230
define i32 @pr128158(i64 %x) {
@@ -137,10 +234,10 @@ define i32 @pr128158(i64 %x) {
137234
; X64-NEXT: addq %rdi, %rax
138235
; X64-NEXT: shrq $32, %rax
139236
; X64-NEXT: .p2align 4
140-
; X64-NEXT: .LBB9_1: # %for.body
237+
; X64-NEXT: .LBB16_1: # %for.body
141238
; X64-NEXT: # =>This Inner Loop Header: Depth=1
142239
; X64-NEXT: cmpl $9, %eax
143-
; X64-NEXT: jb .LBB9_1
240+
; X64-NEXT: jb .LBB16_1
144241
; X64-NEXT: # %bb.2: # %exit
145242
; X64-NEXT: xorl %eax, %eax
146243
; X64-NEXT: retq

0 commit comments

Comments
 (0)