[llvm-branch-commits] [llvm] [LoongArch] Use xvperm.w for cross-lane access within a single vector (PR #151634)

Sat Aug 9 04:13:38 PDT 2025

https://github.com/zhaoqi5 updated https://github.com/llvm/llvm-project/pull/151634

>From f759464ee797830c998d66d1076d98933336c5a1 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Fri, 1 Aug 2025 11:30:19 +0800
Subject: [PATCH 1/2] [LoongArch] Use xvperm.w for cross-lane access within a
 single vector

---
 .../LoongArch/LoongArchISelLowering.cpp       | 44 +++++++++++++++++++
 .../lasx/shuffle-as-permute-and-shuffle.ll    | 18 ++------
 2 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 597650c8229a7..6aa848ca7bd07 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1832,6 +1832,48 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
   return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
 }
 
+/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+  // LoongArch LASX only has XVPERM_W, so only v8i32 and v8f32 are handled.
+  if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfSize = NumElts / 2;
+  bool FrontLo = true, FrontHi = true;
+  bool BackLo = true, BackHi = true;
+
+  auto inRange = [](int val, int low, int high) {
+    return (val == -1) || (val >= low && val < high);
+  };
+
+  for (unsigned i = 0; i < HalfSize; ++i) {
+    int Fronti = Mask[i];
+    int Backi = Mask[i + HalfSize];
+
+    FrontLo &= inRange(Fronti, 0, HalfSize);
+    FrontHi &= inRange(Fronti, HalfSize, NumElts);
+    BackLo &= inRange(Backi, 0, HalfSize);
+    BackHi &= inRange(Backi, HalfSize, NumElts);
+  }
+
+  // If both the lower and upper 128-bit parts access only one half of the
+  // vector (either lower or upper), avoid using xvperm.w. The latency of
+  // xvperm.w(3) is higher than using xvshuf(1) and xvori(1).
+  if ((FrontLo && (BackLo || BackHi)) || (FrontHi && (BackLo || BackHi)))
+    return SDValue();
+
+  SmallVector<SDValue, 8> Masks;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64)
+                                  : DAG.getConstant(Mask[i], DL, MVT::i64));
+  SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks);
+
+  return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec);
+}
+
 /// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
 static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
                                             MVT VT, SDValue V1, SDValue V2,
@@ -2235,6 +2277,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
       return Result;
     if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
       return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG)))
+      return Result;
     if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
                                                              V1, V2, DAG)))
       return Result;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
index fed085843485a..5f76d9951df9c 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -61,13 +61,8 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
 ; CHECK-LABEL: shuffle_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_1)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI4_1)
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT:    xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT:    xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT:    xvperm.w $xr0, $xr0, $xr1
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %shuffle
@@ -117,13 +112,8 @@ define <8 x float> @shuffle_v8f32(<8 x float> %a) {
 ; CHECK-LABEL: shuffle_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT:    xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT:    xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr1, 0
+; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI8_0)
+; CHECK-NEXT:    xvperm.w $xr0, $xr0, $xr1
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %shuffle

>From f934beb1436e5c519df78fdab0e3e94268f1e30f Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Fri, 1 Aug 2025 12:04:02 +0800
Subject: [PATCH 2/2] opt code style

---
 llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 6aa848ca7bd07..1d8998ad90ddf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1862,7 +1862,7 @@ static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
   // If both the lower and upper 128-bit parts access only one half of the
   // vector (either lower or upper), avoid using xvperm.w. The latency of
   // xvperm.w(3) is higher than using xvshuf(1) and xvori(1).
-  if ((FrontLo && (BackLo || BackHi)) || (FrontHi && (BackLo || BackHi)))
+  if ((FrontLo || FrontHi) && (BackLo || BackHi))
     return SDValue();
 
   SmallVector<SDValue, 8> Masks;