Skip to content

Conversation

zhaoqi5
Copy link
Contributor

@zhaoqi5 zhaoqi5 commented Jul 31, 2025

Ideas suggested by: @heiher @tangaac

@llvmbot
Copy link
Member

llvmbot commented Jul 31, 2025

@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/151475.diff

4 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+18-1)
  • (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.h (+1)
  • (modified) llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td (+10)
  • (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll (+8-28)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a5bf0e57e3053..4f534f1666eaa 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2608,13 +2608,29 @@ SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op,
 SDValue
 LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
-  EVT VecTy = Op->getOperand(0)->getValueType(0);
+  MVT EltVT = Op.getSimpleValueType();
+  SDValue Vec = Op->getOperand(0);
+  EVT VecTy = Vec->getValueType(0);
   SDValue Idx = Op->getOperand(1);
   unsigned NumElts = VecTy.getVectorNumElements();
+  SDLoc DL(Op);
+
+  assert(VecTy.is256BitVector() && "Unexpected EXTRACT_VECTOR_ELT vector type");
 
   if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts)
     return Op;
 
+  // TODO: Deal with other legal 256-bits vector types?
+  if (!isa<ConstantSDNode>(Idx) &&
+      (VecTy == MVT::v8i32 || VecTy == MVT::v8f32)) {
+    SDValue SplatIdx = DAG.getSplatBuildVector(MVT::v8i32, DL, Idx);
+    SDValue SplatValue =
+        DAG.getNode(LoongArchISD::XVPERM, DL, VecTy, Vec, SplatIdx);
+
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SplatValue,
+                       DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
+  }
+
   return SDValue();
 }
 
@@ -6632,6 +6648,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VREPLVEI)
     NODE_NAME_CASE(VREPLGR2VR)
     NODE_NAME_CASE(XVPERMI)
+    NODE_NAME_CASE(XVPERM)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 6b49a98f3ae46..32a695825342e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -141,6 +141,7 @@ enum NodeType : unsigned {
   VREPLVEI,
   VREPLGR2VR,
   XVPERMI,
+  XVPERM,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5096a8fcda8eb..7f646ad0d6fdc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,8 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                        SDTCisVec<2>, SDTCisInt<2>]>;
+
 // Target nodes.
 def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>;
+def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>;
 def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>;
@@ -1835,6 +1839,12 @@ def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8),
 def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8),
           (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>;
 
+// XVPERM_W
+def : Pat<(loongarch_xvperm v8i32:$xj, v8i32:$xk),
+          (XVPERM_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_xvperm v8f32:$xj, v8i32:$xk),
+          (XVPERM_W v8f32:$xj, v8i32:$xk)>;
+
 // XVREPLVE0_{W/D}
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
index 2e1618748688a..b191a9d08ab2d 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
@@ -126,21 +126,11 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xi32_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 96
-; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32
-; CHECK-NEXT:    addi.d $a0, $sp, 32
-; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    st.w $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -96
-; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a0
+; CHECK-NEXT:    xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvstelm.w $xr0, $a1, 0, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x i32>, ptr %src
   %e = extractelement <8 x i32> %v, i32 %idx
@@ -176,21 +166,11 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xfloat_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 96
-; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32
-; CHECK-NEXT:    addi.d $a0, $sp, 32
-; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT:    fld.s $fa0, $a0, 0
-; CHECK-NEXT:    fst.s $fa0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -96
-; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a0
+; CHECK-NEXT:    xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvstelm.w $xr0, $a1, 0, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x float>, ptr %src
   %e = extractelement <8 x float> %v, i32 %idx

@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Jul 31, 2025

Any idea for other 256-bits types?

@zhaoqi5 zhaoqi5 requested review from tangaac and SixWeining July 31, 2025 09:22
@zhaoqi5 zhaoqi5 force-pushed the users/zhaoqi5/opt-extractelement-idx branch from de080ca to 80ccc27 Compare August 1, 2025 13:10
@zhaoqi5 zhaoqi5 requested a review from SixWeining August 1, 2025 13:11
@tangaac
Copy link
Member

tangaac commented Aug 4, 2025

@zhaoqi5 zhaoqi5 force-pushed the users/zhaoqi5/opt-extractelement-idx branch from d764815 to f8b7d4c Compare August 9, 2025 11:07
@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Sep 2, 2025

Ping. Updates made after last approval.

@zhaoqi5 zhaoqi5 force-pushed the users/zhaoqi5/opt-extractelement-idx branch from f8b7d4c to 2ffde95 Compare September 2, 2025 09:05
@heiher
Copy link
Member

heiher commented Sep 2, 2025

Are these okay?

xr0: v32i8
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.b   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.b        xr2, xr1, xr0, xr2 1/2
Total:                             7/1

xr0: v16i16
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.h   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.h        xr2, xr1, xr0      1/2
Total:                             7/1

xr0: v4i64
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.d   xr2, a0            2/1
xvshuf.d        xr2, xr1, xr0      1/2
Total:                             6/1

@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Sep 3, 2025

Are these okay?

xr0: v32i8
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.b   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.b        xr2, xr1, xr0, xr2 1/2
Total:                             7/1

xr0: v16i16
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.h   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.h        xr2, xr1, xr0      1/2
Total:                             7/1

xr0: v4i64
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.d   xr2, a0            2/1
xvshuf.d        xr2, xr1, xr0      1/2
Total:                             6/1

I think it is right and better. My thought process was entirely limited by xvperm.w. Thanks for your idea.

What do you think about this? @tangaac

@tangaac
Copy link
Member

tangaac commented Sep 3, 2025

Are these okay?

xr0: v32i8
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.b   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.b        xr2, xr1, xr0, xr2 1/2
Total:                             7/1

xr0: v16i16
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.h   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.h        xr2, xr1, xr0      1/2
Total:                             7/1

xr0: v4i64
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.d   xr2, a0            2/1
xvshuf.d        xr2, xr1, xr0      1/2
Total:                             6/1

I think it is right and better. My thought process was entirely limited by xvperm.w. Thanks for your idea.

What do you think about this? @tangaac

It shows better performance.

@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Sep 3, 2025

Updated. @heiher @tangaac . Please take a look when you are free.

Copy link
Member

@heiher heiher left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Thanks!

; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 96
; CHECK-NEXT: xvreplgr2vr.w $xr1, $a2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM
It seems this xvreplgr2vr could be replaced.
Fot LSX, your optimization may work too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems this xvreplgr2vr could be replaced.

xvreplgr2vr.w and movgr2fr.w have the same latency and throughput.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We indeed can use movgr2fr+vshuf to replace vreplve+movfr2gr for LSX. Thank you for mentioning it. I will push a new PR to do this.

@zhaoqi5 zhaoqi5 changed the title [LoongArch] Optimize extractelement containing variable index [LoongArch] Optimize extractelement containing variable index for lasx Sep 3, 2025
@zhaoqi5 zhaoqi5 merged commit c507848 into main Sep 4, 2025
9 checks passed
@zhaoqi5 zhaoqi5 deleted the users/zhaoqi5/opt-extractelement-idx branch September 4, 2025 01:27
@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 4, 2025

LLVM Buildbot has detected a new failure on builder clang-aarch64-quick running on linaro-clang-aarch64-quick while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/22141

Here is the relevant piece of the build log for the reference
Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'lit :: max-time.py' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 5
env -u FILECHECK_OPTS "/usr/bin/python3.10" /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical Inputs/max-time --max-time=5 2>&1  |  FileCheck /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py
# executed command: env -u FILECHECK_OPTS /usr/bin/python3.10 /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical Inputs/max-time --max-time=5
# executed command: FileCheck /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py
# .---command stderr------------
# | /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py:8:10: error: CHECK: expected string not found in input
# | # CHECK: Skipped: 1
# |          ^
# | <stdin>:2:51: note: scanning from here
# | warning: reached timeout, skipping remaining tests
# |                                                   ^
# | <stdin>:7:2: note: possible intended match here
# |  Skipped: 2 (100.00%)
# |  ^
# | 
# | Input file: <stdin>
# | Check file: /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |            1: -- Testing: 2 tests, 1 workers -- 
# |            2: warning: reached timeout, skipping remaining tests 
# | check:8'0                                                       X error: no match found
# |            3:  
# | check:8'0     ~
# |            4: Testing Time: 7.07s 
# | check:8'0     ~~~~~~~~~~~~~~~~~~~~
# |            5:  
# | check:8'0     ~
# |            6: Total Discovered Tests: 2 
# | check:8'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            7:  Skipped: 2 (100.00%) 
# | check:8'0     ~~~~~~~~~~~~~~~~~~~~~~
# | check:8'1      ?                     possible intended match
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--

********************


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants