[llvm-branch-commits] [llvm] [LoongArch] Use xvperm.w for cross-lane access within a single vector (PR #151634)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Aug 9 04:13:38 PDT 2025
https://github.com/zhaoqi5 updated https://github.com/llvm/llvm-project/pull/151634
>From f759464ee797830c998d66d1076d98933336c5a1 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Fri, 1 Aug 2025 11:30:19 +0800
Subject: [PATCH 1/2] [LoongArch] Use xvperm.w for cross-lane access within a
single vector
---
.../LoongArch/LoongArchISelLowering.cpp | 44 +++++++++++++++++++
.../lasx/shuffle-as-permute-and-shuffle.ll | 18 ++------
2 files changed, 48 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 597650c8229a7..6aa848ca7bd07 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1832,6 +1832,48 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
}
+/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ // LoongArch LASX only has XVPERM_W.
+ if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfSize = NumElts / 2;
+ bool FrontLo = true, FrontHi = true;
+ bool BackLo = true, BackHi = true;
+
+ auto inRange = [](int val, int low, int high) {
+ return (val == -1) || (val >= low && val < high);
+ };
+
+ for (unsigned i = 0; i < HalfSize; ++i) {
+ int Fronti = Mask[i];
+ int Backi = Mask[i + HalfSize];
+
+ FrontLo &= inRange(Fronti, 0, HalfSize);
+ FrontHi &= inRange(Fronti, HalfSize, NumElts);
+ BackLo &= inRange(Backi, 0, HalfSize);
+ BackHi &= inRange(Backi, HalfSize, NumElts);
+ }
+
+ // If both the lower and upper 128-bit parts access only one half of the
+ // vector (either lower or upper), avoid using xvperm.w. The latency of
+ // xvperm.w(3) is higher than using xvshuf(1) and xvori(1).
+ if ((FrontLo && (BackLo || BackHi)) || (FrontHi && (BackLo || BackHi)))
+ return SDValue();
+
+ SmallVector<SDValue, 8> Masks;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64)
+ : DAG.getConstant(Mask[i], DL, MVT::i64));
+ SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks);
+
+ return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec);
+}
+
/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
@@ -2235,6 +2277,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
return Result;
+ if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG)))
+ return Result;
if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
V1, V2, DAG)))
return Result;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
index fed085843485a..5f76d9951df9c 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -61,13 +61,8 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
; CHECK-LABEL: shuffle_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -117,13 +112,8 @@ define <8 x float> @shuffle_v8f32(<8 x float> %a) {
; CHECK-LABEL: shuffle_v8f32:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI8_0)
+; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
>From f934beb1436e5c519df78fdab0e3e94268f1e30f Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Fri, 1 Aug 2025 12:04:02 +0800
Subject: [PATCH 2/2] opt code style
---
llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 6aa848ca7bd07..1d8998ad90ddf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1862,7 +1862,7 @@ static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
// If both the lower and upper 128-bit parts access only one half of the
// vector (either lower or upper), avoid using xvperm.w. The latency of
// xvperm.w(3) is higher than using xvshuf(1) and xvori(1).
- if ((FrontLo && (BackLo || BackHi)) || (FrontHi && (BackLo || BackHi)))
+ if ((FrontLo || FrontHi) && (BackLo || BackHi))
return SDValue();
SmallVector<SDValue, 8> Masks;
More information about the llvm-branch-commits
mailing list