[LoongArch] Optimize extractelement containing variable index for lasx #151475

zhaoqi5 · 2025-07-31T09:17:06Z

Ideas suggested by: @heiher @tangaac

llvmbot · 2025-07-31T09:17:40Z

@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/151475.diff

4 Files Affected:

(modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+18-1)
(modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.h (+1)
(modified) llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td (+10)
(modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll (+8-28)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a5bf0e57e3053..4f534f1666eaa 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2608,13 +2608,29 @@ SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op,
 SDValue
 LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
-  EVT VecTy = Op->getOperand(0)->getValueType(0);
+  MVT EltVT = Op.getSimpleValueType();
+  SDValue Vec = Op->getOperand(0);
+  EVT VecTy = Vec->getValueType(0);
   SDValue Idx = Op->getOperand(1);
   unsigned NumElts = VecTy.getVectorNumElements();
+  SDLoc DL(Op);
+
+  assert(VecTy.is256BitVector() && "Unexpected EXTRACT_VECTOR_ELT vector type");
 
   if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts)
     return Op;
 
+  // TODO: Deal with other legal 256-bits vector types?
+  if (!isa<ConstantSDNode>(Idx) &&
+      (VecTy == MVT::v8i32 || VecTy == MVT::v8f32)) {
+    SDValue SplatIdx = DAG.getSplatBuildVector(MVT::v8i32, DL, Idx);
+    SDValue SplatValue =
+        DAG.getNode(LoongArchISD::XVPERM, DL, VecTy, Vec, SplatIdx);
+
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SplatValue,
+                       DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
+  }
+
   return SDValue();
 }
 
@@ -6632,6 +6648,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VREPLVEI)
     NODE_NAME_CASE(VREPLGR2VR)
     NODE_NAME_CASE(XVPERMI)
+    NODE_NAME_CASE(XVPERM)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 6b49a98f3ae46..32a695825342e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -141,6 +141,7 @@ enum NodeType : unsigned {
   VREPLVEI,
   VREPLGR2VR,
   XVPERMI,
+  XVPERM,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5096a8fcda8eb..7f646ad0d6fdc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,8 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                        SDTCisVec<2>, SDTCisInt<2>]>;
+
 // Target nodes.
 def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>;
+def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>;
 def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>;
@@ -1835,6 +1839,12 @@ def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8),
 def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8),
           (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>;
 
+// XVPERM_W
+def : Pat<(loongarch_xvperm v8i32:$xj, v8i32:$xk),
+          (XVPERM_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_xvperm v8f32:$xj, v8i32:$xk),
+          (XVPERM_W v8f32:$xj, v8i32:$xk)>;
+
 // XVREPLVE0_{W/D}
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
index 2e1618748688a..b191a9d08ab2d 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
@@ -126,21 +126,11 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xi32_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 96
-; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32
-; CHECK-NEXT:    addi.d $a0, $sp, 32
-; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    st.w $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -96
-; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a0
+; CHECK-NEXT:    xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvstelm.w $xr0, $a1, 0, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x i32>, ptr %src
   %e = extractelement <8 x i32> %v, i32 %idx
@@ -176,21 +166,11 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xfloat_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 96
-; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32
-; CHECK-NEXT:    addi.d $a0, $sp, 32
-; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT:    fld.s $fa0, $a0, 0
-; CHECK-NEXT:    fst.s $fa0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -96
-; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a0
+; CHECK-NEXT:    xvperm.w $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvstelm.w $xr0, $a1, 0, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x float>, ptr %src
   %e = extractelement <8 x float> %v, i32 %idx

zhaoqi5 · 2025-07-31T09:18:00Z

Any ideas for other 256-bit types?

llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll

tangaac · 2025-08-04T02:13:00Z

tangaac/loong-opt-cov-ts@934517a

zhaoqi5 · 2025-09-02T03:36:35Z

Ping. Updates made after last approval.

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

heiher · 2025-09-02T12:27:00Z

Are these okay?

xr0: v32i8
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.b   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.b        xr2, xr1, xr0, xr2 1/2
Total:                             7/1

xr0: v16i16
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.h   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
xvshuf.h        xr2, xr1, xr0      1/2
Total:                             7/1

xr0: v4i64
a0 : index
                                   L/T
xvpermi.q       xr1, xr0, 1        3/4
xvreplgr2vr.d   xr2, a0            2/1
xvshuf.d        xr2, xr1, xr0      1/2
Total:                             6/1

zhaoqi5 · 2025-09-03T01:26:06Z

> Are these okay?
>
> xr0: v32i8
> a0 : index
>                                    L/T
> xvpermi.q       xr1, xr0, 1        3/4
> xvreplgr2vr.b   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
> xvshuf.b        xr2, xr1, xr0, xr2 1/2
> Total:                             7/1
>
> xr0: v16i16
> a0 : index
>                                    L/T
> xvpermi.q       xr1, xr0, 1        3/4
> xvreplgr2vr.h   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
> xvshuf.h        xr2, xr1, xr0      1/2
> Total:                             7/1
>
> xr0: v4i64
> a0 : index
>                                    L/T
> xvpermi.q       xr1, xr0, 1        3/4
> xvreplgr2vr.d   xr2, a0            2/1
> xvshuf.d        xr2, xr1, xr0      1/2
> Total:                             6/1

I think it is right and better. My thought process was entirely limited by xvperm.w. Thanks for your idea.

What do you think about this? @tangaac

tangaac · 2025-09-03T03:03:41Z

> > Are these okay?
> >
> > xr0: v32i8
> > a0 : index
> >                                    L/T
> > xvpermi.q       xr1, xr0, 1        3/4
> > xvreplgr2vr.b   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
> > xvshuf.b        xr2, xr1, xr0, xr2 1/2
> > Total:                             7/1
> >
> > xr0: v16i16
> > a0 : index
> >                                    L/T
> > xvpermi.q       xr1, xr0, 1        3/4
> > xvreplgr2vr.h   xr2, a0            3/1  // movgr2fr.w    f2, a0    2/1
> > xvshuf.h        xr2, xr1, xr0      1/2
> > Total:                             7/1
> >
> > xr0: v4i64
> > a0 : index
> >                                    L/T
> > xvpermi.q       xr1, xr0, 1        3/4
> > xvreplgr2vr.d   xr2, a0            2/1
> > xvshuf.d        xr2, xr1, xr0      1/2
> > Total:                             6/1
>
> I think it is right and better. My thought process was entirely limited by xvperm.w. Thanks for your idea.
>
> What do you think about this? @tangaac

It shows better performance.

zhaoqi5 · 2025-09-03T07:10:28Z

Updated. @heiher @tangaac . Please take a look when you are free.

heiher

LGTM. Thanks!

tangaac · 2025-09-03T08:15:23Z

llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll

-; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    xvreplgr2vr.w $xr1, $a2


LGTM
It seems this xvreplgr2vr could be replaced.
For LSX, your optimization may work too.

> It seems this xvreplgr2vr could be replaced.

xvreplgr2vr.w and movgr2fr.w have the same latency and throughput.

We indeed can use movgr2fr+vshuf to replace vreplve+movfr2gr for LSX. Thank you for mentioning it. I will push a new PR to do this.

llvm-ci · 2025-09-04T01:41:44Z

LLVM Buildbot has detected a new failure on builder clang-aarch64-quick running on linaro-clang-aarch64-quick while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/22141

Here is the relevant piece of the build log for reference:

Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'lit :: max-time.py' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 5
env -u FILECHECK_OPTS "/usr/bin/python3.10" /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical Inputs/max-time --max-time=5 2>&1  |  FileCheck /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py
# executed command: env -u FILECHECK_OPTS /usr/bin/python3.10 /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical Inputs/max-time --max-time=5
# executed command: FileCheck /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py
# .---command stderr------------
# | /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py:8:10: error: CHECK: expected string not found in input
# | # CHECK: Skipped: 1
# |          ^
# | <stdin>:2:51: note: scanning from here
# | warning: reached timeout, skipping remaining tests
# |                                                   ^
# | <stdin>:7:2: note: possible intended match here
# |  Skipped: 2 (100.00%)
# |  ^
# | 
# | Input file: <stdin>
# | Check file: /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/max-time.py
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |            1: -- Testing: 2 tests, 1 workers -- 
# |            2: warning: reached timeout, skipping remaining tests 
# | check:8'0                                                       X error: no match found
# |            3:  
# | check:8'0     ~
# |            4: Testing Time: 7.07s 
# | check:8'0     ~~~~~~~~~~~~~~~~~~~~
# |            5:  
# | check:8'0     ~
# |            6: Total Discovered Tests: 2 
# | check:8'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            7:  Skipped: 2 (100.00%) 
# | check:8'0     ~~~~~~~~~~~~~~~~~~~~~~
# | check:8'1      ?                     possible intended match
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--

********************

llvmbot added the backend:loongarch label Jul 31, 2025

zhaoqi5 requested review from tangaac and SixWeining July 31, 2025 09:22

SixWeining approved these changes Aug 1, 2025

View reviewed changes

SixWeining reviewed Aug 1, 2025

View reviewed changes

llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td Outdated Show resolved Hide resolved

zhaoqi5 force-pushed the users/zhaoqi5/opt-extractelement-idx branch from de080ca to 80ccc27 Compare August 1, 2025 13:10

zhaoqi5 requested a review from SixWeining August 1, 2025 13:11

SixWeining reviewed Aug 4, 2025

View reviewed changes

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp Outdated Show resolved Hide resolved

tangaac reviewed Aug 4, 2025

View reviewed changes

llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll Outdated Show resolved Hide resolved

tangaac approved these changes Aug 4, 2025

View reviewed changes

zhaoqi5 force-pushed the users/zhaoqi5/opt-extractelement-idx branch from d764815 to f8b7d4c Compare August 9, 2025 11:07

heiher reviewed Sep 2, 2025

View reviewed changes

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp Outdated Show resolved Hide resolved

heiher reviewed Sep 2, 2025

View reviewed changes

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp Outdated Show resolved Hide resolved

zhaoqi5 added 6 commits September 2, 2025 16:55

[LoongArch] Optimize extractelement containing variable index

a194851

deal with other lasx types

536fab8

add comments

465a1c0

address comments

7d46a1a

perform combine for extract_vector_elt

86ced14

address comments: using arithmetic right shift

2ffde95

zhaoqi5 force-pushed the users/zhaoqi5/opt-extractelement-idx branch from f8b7d4c to 2ffde95 Compare September 2, 2025 09:05

address comments: remove redundant andi

94af475

zhaoqi5 added 2 commits September 3, 2025 14:13

better choice

e0b1b66

use movgr2fr

4ad7f8a

heiher approved these changes Sep 3, 2025

View reviewed changes

tangaac reviewed Sep 3, 2025

View reviewed changes

zhaoqi5 changed the title ~~[LoongArch] Optimize extractelement containing variable index~~ [LoongArch] Optimize extractelement containing variable index for lasx Sep 3, 2025

Merge branch 'main' into users/zhaoqi5/opt-extractelement-idx

58cb967

zhaoqi5 merged commit c507848 into main Sep 4, 2025
9 checks passed

zhaoqi5 deleted the users/zhaoqi5/opt-extractelement-idx branch September 4, 2025 01:27

[LoongArch] Optimize extractelement containing variable index for lasx #151475

[LoongArch] Optimize extractelement containing variable index for lasx #151475

Uh oh!

Conversation

zhaoqi5 commented Jul 31, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jul 31, 2025

Uh oh!

zhaoqi5 commented Jul 31, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

tangaac commented Aug 4, 2025

Uh oh!

zhaoqi5 commented Sep 2, 2025

Uh oh!

Uh oh!

Uh oh!

heiher commented Sep 2, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

zhaoqi5 commented Sep 3, 2025

Uh oh!

tangaac commented Sep 3, 2025

Uh oh!

zhaoqi5 commented Sep 3, 2025

Uh oh!

heiher left a comment

Choose a reason for hiding this comment

Uh oh!

tangaac Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

heiher Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

tangaac Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

zhaoqi5 Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Sep 4, 2025

Uh oh!

Uh oh!

zhaoqi5 commented Jul 31, 2025 •

edited

Loading

heiher commented Sep 2, 2025 •

edited

Loading