[llvm] [LoongArch] Custom legalize vector_shuffle to xvpermi.d when possible (PR #160429)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 25 23:45:21 PDT 2025
https://github.com/zhaoqi5 updated https://github.com/llvm/llvm-project/pull/160429
From 80abdb7318bc7e1ff7bc4ea1f6ff0b1cc4a9813e Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Tue, 23 Sep 2025 16:02:12 +0800
Subject: [PATCH 1/3] [LoongArch] Refine 256-bit vector_shuffle legalization
for LASX
---
.../LoongArch/LoongArchISelLowering.cpp | 73 +++++++++++--------
.../lasx/ir-instruction/fix-xvshuf.ll | 2 +-
.../ir-instruction/shuffle-as-xvrepl128vei.ll | 7 +-
3 files changed, 45 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 32baa2d111270..9662fec5ef4b9 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2060,7 +2060,10 @@ lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
const auto &Begin = Mask.begin();
const auto &End = Mask.end();
- unsigned HalfSize = Mask.size() / 2;
+ int HalfSize = Mask.size() / 2;
+
+ if (SplatIndex >= HalfSize)
+ return SDValue();
assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
@@ -2354,7 +2357,7 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
/// cases need to be converted to it for processing.
///
/// This function may modify V1, V2 and Mask
-static void canonicalizeShuffleVectorByLane(
+static bool canonicalizeShuffleVectorByLane(
const SDLoc &DL, MutableArrayRef<int> Mask, MVT VT, SDValue &V1,
SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) {
@@ -2378,15 +2381,15 @@ static void canonicalizeShuffleVectorByLane(
preMask = LowLaneTy;
if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
- return M < 0 || (M >= 0 && M < HalfSize) ||
- (M >= MaskSize && M < MaskSize + HalfSize);
+ return M < 0 || (M >= HalfSize && M < MaskSize) ||
+ (M >= MaskSize + HalfSize && M < MaskSize * 2);
}))
- postMask = HighLaneTy;
+ postMask = LowLaneTy;
else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
- return M < 0 || (M >= HalfSize && M < MaskSize) ||
- (M >= MaskSize + HalfSize && M < MaskSize * 2);
+ return M < 0 || (M >= 0 && M < HalfSize) ||
+ (M >= MaskSize && M < MaskSize + HalfSize);
}))
- postMask = LowLaneTy;
+ postMask = HighLaneTy;
// The pre-half of mask is high lane type, and the post-half of mask
// is low lane type, which is closest to the LoongArch instructions.
@@ -2395,7 +2398,7 @@ static void canonicalizeShuffleVectorByLane(
// to the lower 128-bit of vector register, and the low lane of mask
// corresponds the higher 128-bit of vector register.
if (preMask == HighLaneTy && postMask == LowLaneTy) {
- return;
+ return false;
}
if (preMask == LowLaneTy && postMask == HighLaneTy) {
V1 = DAG.getBitcast(MVT::v4i64, V1);
@@ -2449,8 +2452,10 @@ static void canonicalizeShuffleVectorByLane(
*it = *it < 0 ? *it : *it + HalfSize;
}
} else { // cross-lane
- return;
+ return false;
}
+
+ return true;
}
/// Lower VECTOR_SHUFFLE as lane permute and then shuffle (if possible).
@@ -2516,27 +2521,20 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
assert(Mask.size() % 2 == 0 && "Expected even mask size.");
assert(Mask.size() >= 4 && "Mask size is less than 4.");
- // canonicalize non cross-lane shuffle vector
- SmallVector<int> NewMask(Mask);
- canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget);
-
APInt KnownUndef, KnownZero;
- computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero);
+ computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
APInt Zeroable = KnownUndef | KnownZero;
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
- if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG,
+ if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG,
+ if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG)))
- return Result;
- if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
- V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, V2, DAG)))
return Result;
// TODO: This comment may be enabled in the future to better match the
@@ -2546,24 +2544,39 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
// It is recommended not to change the pattern comparison order for better
// performance.
- if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, Mask, VT, V1, V2, DAG)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, Mask, VT, V1, V2, DAG)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, Mask, VT, V1, V2, DAG)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, Mask, VT, V1, V2, DAG)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, Mask, VT, V1, V2, DAG)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, Mask, VT, V1, V2, DAG)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG,
- Subtarget, Zeroable)))
+ if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget,
+ Zeroable)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG,
+ if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
+
+ // canonicalize non cross-lane shuffle vector
+ SmallVector<int> NewMask(Mask);
+ if (canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget))
+ return lower256BitShuffle(DL, NewMask, VT, V1, V2, DAG, Subtarget);
+
+ // FIXME: Handling the remaining cases earlier can degrade performance
+ // in some situations. Further analysis is required to enable more
+ // effective optimizations.
+ if (V2.isUndef()) {
+ if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
+ V1, V2, DAG)))
+ return Result;
+ }
+
if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG))
return NewShuffle;
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG)))
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index 765473ce166df..30539427a1a0a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -8,7 +8,7 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve.d $xr2, $xr1, 3
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT: xvpermi.d $xr3, $xr0, 238
; CHECK-NEXT: xvrepl128vei.d $xr3, $xr3, 1
; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
; CHECK-NEXT: xvpickve.d $xr1, $xr1, 2
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
index 935a30a3e54ed..69bb5899e685f 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
@@ -16,7 +16,6 @@ define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <32 x i8> @shufflevector_v32i8_undef(<32 x i8> %a) {
; CHECK-LABEL: shufflevector_v32i8_undef:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr0, $xr0, 1
; CHECK-NEXT: ret
%c = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
@@ -39,7 +38,6 @@ define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @shufflevector_v16i16_undef(<16 x i16> %a) {
; CHECK-LABEL: shufflevector_v16i16_undef:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr0, $xr0, 3
; CHECK-NEXT: ret
%c = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
@@ -62,7 +60,6 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shufflevector_v8i32_undef(<8 x i32> %a) {
; CHECK-LABEL: shufflevector_v8i32_undef:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 2
; CHECK-NEXT: ret
%c = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 8, i32 8, i32 8, i32 8>
@@ -83,7 +80,6 @@ define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shufflevector_v4i64_undef(<4 x i64> %a) {
; CHECK-LABEL: shufflevector_v4i64_undef:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 1
; CHECK-NEXT: ret
%c = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
@@ -104,7 +100,7 @@ define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shufflevector_v8f32_undef(<8 x float> %a) {
; CHECK-LABEL: shufflevector_v8f32_undef:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 238
; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 1
; CHECK-NEXT: ret
%c = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 9, i32 9, i32 9, i32 9>
@@ -125,7 +121,6 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shufflevector_v4f64_undef(<4 x double> %a) {
; CHECK-LABEL: shufflevector_v4f64_undef:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 0
; CHECK-NEXT: ret
%c = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 7, i32 7>
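
[Note on patch 1] xvrepl128vei broadcasts one element within each 128-bit lane, so the splat index it can encode must fall in the low half of the mask range; the early return added to lowerVECTOR_SHUFFLE_XVREPLVEI makes that precondition explicit. The patch also fixes the swapped LowLaneTy/HighLaneTy classification of the second mask half in canonicalizeShuffleVectorByLane, makes that function report through a bool whether it actually rewrote the operands, and defers the canonicalization in lower256BitShuffle until the single-instruction patterns have been tried on the original mask, recursing only when a rewrite happened.

A minimal standalone sketch of the fixed classification, with my own names (the in-tree code inlines this as two std::all_of calls per half): a half of the mask is "low lane" if every defined index selects from the low 128 bits of V1 ([0, HalfSize)) or of V2 ([MaskSize, MaskSize + HalfSize)), and "high lane" in the mirrored case:

    #include <algorithm>
    #include <optional>

    enum class LaneTy { Low, High };

    // Classify the mask entries in [Begin, End); MaskSize is the size of
    // the full shuffle mask, i.e. twice the number of entries per half.
    static std::optional<LaneTy> classifyHalf(const int *Begin, const int *End,
                                              int MaskSize) {
      int HalfSize = MaskSize / 2;
      auto InLow = [&](int M) {
        return M < 0 || (M >= 0 && M < HalfSize) ||
               (M >= MaskSize && M < MaskSize + HalfSize);
      };
      auto InHigh = [&](int M) {
        return M < 0 || (M >= HalfSize && M < MaskSize) ||
               (M >= MaskSize + HalfSize && M < MaskSize * 2);
      };
      if (std::all_of(Begin, End, InLow))
        return LaneTy::Low;
      if (std::all_of(Begin, End, InHigh))
        return LaneTy::High;
      return std::nullopt; // this half mixes lanes (cross-lane case)
    }

With the fix, a high-lane pre-half paired with a low-lane post-half already matches the LoongArch instruction layout and needs no rewrite; the mirrored and same-lane combinations are rewritten via xvpermi.d, and std::nullopt on either half corresponds to the cross-lane fallback that now returns false.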
From 47274d059b280f7721486e4ea6e656adbfe4357f Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Wed, 24 Sep 2025 09:09:25 +0800
Subject: [PATCH 2/3] [LoongArch] Custom legalize vector_shuffle to xvpermi.d
when possible
---
.../LoongArch/LoongArchISelLowering.cpp | 23 +++++++++
.../lasx/ir-instruction/fix-xvshuf.ll | 9 ++--
.../lasx/shuffle-as-permute-and-shuffle.ll | 48 ++++---------------
.../LoongArch/lasx/vec-shuffle-byte-rotate.ll | 4 +-
4 files changed, 37 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 9662fec5ef4b9..7e313f0b90d34 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2088,6 +2088,26 @@ lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget);
}
+/// Lower VECTOR_SHUFFLE into XVPERMI (if possible).
+static SDValue
+lowerVECTOR_SHUFFLE_XVPERMI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
+ // Only consider XVPERMI_D.
+ if (Mask.size() != 4 || (VT != MVT::v4i64 && VT != MVT::v4f64))
+ return SDValue();
+
+ unsigned MaskImm = 0;
+ for (unsigned i = 0; i < Mask.size(); ++i) {
+ if (Mask[i] == -1)
+ continue;
+ MaskImm |= Mask[i] << (i * 2);
+ }
+
+ return DAG.getNode(LoongArchISD::XVPERMI, DL, VT, V1,
+ DAG.getConstant(MaskImm, DL, Subtarget.getGRLenVT()));
+}
+
/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
@@ -2534,6 +2554,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
+ if ((Result =
+ lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, V2, DAG, Subtarget)))
+ return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, V2, DAG)))
return Result;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index 30539427a1a0a..0b8015ddbdd4a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -7,13 +7,12 @@
define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve.d $xr2, $xr1, 3
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 238
-; CHECK-NEXT: xvrepl128vei.d $xr3, $xr3, 1
-; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
+; CHECK-NEXT: xvpermi.d $xr2, $xr0, 3
+; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3
+; CHECK-NEXT: vextrins.d $vr2, $vr3, 16
; CHECK-NEXT: xvpickve.d $xr1, $xr1, 2
; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2
+; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
; CHECK-NEXT: ret
entry:
%c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 6, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
index 5f76d9951df9c..ee1e9f4ce4e5c 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -5,11 +5,8 @@ define <32 x i8> @shuffle_v32i8(<32 x i8> %a) {
; CHECK-LABEL: shuffle_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.h $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
@@ -33,11 +30,8 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
; CHECK-LABEL: shuffle_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
@@ -71,10 +65,7 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) {
; CHECK-LABEL: shuffle_v8i32_same_lane:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 225
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -83,14 +74,7 @@ define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) {
define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
; CHECK-LABEL: shuffle_v4i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 39
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
ret <4 x i64> %shuffle
@@ -99,10 +83,7 @@ define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
define <4 x i64> @shuffle_v4i64_same_lane(<4 x i64> %a) {
; CHECK-LABEL: shuffle_v4i64_same_lane:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 225
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
ret <4 x i64> %shuffle
@@ -135,14 +116,7 @@ define <8 x float> @shuffle_v8f32_same_lane(<8 x float> %a) {
define <4 x double> @shuffle_v4f64(<4 x double> %a) {
; CHECK-LABEL: shuffle_v4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 39
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
ret <4 x double> %shuffle
@@ -151,11 +125,7 @@ define <4 x double> @shuffle_v4f64(<4 x double> %a) {
define <4 x double> @shuffle_v4f64_same_lane(<4 x double> %a) {
; CHECK-LABEL: shuffle_v4f64_same_lane:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 75
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
ret <4 x double> %shuffle
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll
index b697a2fd07435..eaf33d46a8803 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll
@@ -126,9 +126,7 @@ define <4 x i64> @byte_rotate_v4i64_2(<4 x i64> %a, <4 x i64> %b) nounwind {
define <4 x i64> @byte_rotate_v4i64_3(<4 x i64> %a) nounwind {
; CHECK-LABEL: byte_rotate_v4i64_3:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvbsrl.v $xr1, $xr0, 8
-; CHECK-NEXT: xvbsll.v $xr0, $xr0, 8
-; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 177
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x i64> %shuffle
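
[Note on patch 2] xvpermi.d selects each of the four 64-bit result elements with a 2-bit field of its 8-bit immediate, which is why the new lowerVECTOR_SHUFFLE_XVPERMI is restricted to v4i64/v4f64 with a 4-element mask and is only tried when V2 is undef. A sketch of the encoding it computes (helper name is mine; it assumes all defined mask entries are in [0, 3], i.e. every select is from V1):

    // Field i of the immediate names the source element that lands in
    // result element i; undef entries (-1) leave their field as 0.
    static unsigned encodeXVPERMI_D(const int Mask[4]) {
      unsigned Imm = 0;
      for (unsigned i = 0; i < 4; ++i)
        if (Mask[i] >= 0)
          Imm |= unsigned(Mask[i]) << (i * 2);
      return Imm;
    }

Worked against the updated CHECK lines above: mask <1,0,3,2> gives 0b10110001 = 177 (byte_rotate_v4i64_3), <3,1,2,0> gives 39 (shuffle_v4i64/shuffle_v4f64), <1,0,2,3> gives 225, and <3,2,0,1> gives 75 (the same_lane tests).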
From bd5608f2ab4fa75e669cee0135c05e9a3b703fc5 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Thu, 25 Sep 2025 09:35:34 +0800
Subject: [PATCH 3/3] address heiher's comment
---
.../LoongArch/LoongArchISelLowering.cpp | 20 +++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7e313f0b90d34..40c8db5c499b9 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1596,7 +1596,7 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
/// value is necessary in order to fit the above form.
static SDValue
lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG,
+ SDValue V1, SelectionDAG &DAG,
const LoongArchSubtarget &Subtarget) {
int SplatIndex = -1;
for (const auto &M : Mask) {
@@ -1989,8 +1989,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
- if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG,
- Subtarget)))
+ if ((Result =
+ lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
if ((Result =
lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
@@ -2045,7 +2045,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
/// value is necessary in order to fit the above form.
static SDValue
lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG,
+ SDValue V1, SelectionDAG &DAG,
const LoongArchSubtarget &Subtarget) {
int SplatIndex = -1;
for (const auto &M : Mask) {
@@ -2091,7 +2091,7 @@ lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
/// Lower VECTOR_SHUFFLE into XVPERMI (if possible).
static SDValue
lowerVECTOR_SHUFFLE_XVPERMI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG,
+ SDValue V1, SelectionDAG &DAG,
const LoongArchSubtarget &Subtarget) {
// Only consider XVPERMI_D.
if (Mask.size() != 4 || (VT != MVT::v4i64 && VT != MVT::v4f64))
@@ -2110,7 +2110,7 @@ lowerVECTOR_SHUFFLE_XVPERMI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
+ MVT VT, SDValue V1,
SelectionDAG &DAG) {
// LoongArch LASX only have XVPERM_W.
if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
@@ -2548,16 +2548,16 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
- if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, V2, DAG,
- Subtarget)))
+ if ((Result =
+ lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
if ((Result =
- lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, V2, DAG, Subtarget)))
+ lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
- if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, V2, DAG)))
+ if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG)))
return Result;
// TODO: This comment may be enabled in the future to better match the