[llvm] [LoongArch] lower vector shuffle to shift if possible (PR #132866)

via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 2 02:11:44 PDT 2025


https://github.com/tangaac updated https://github.com/llvm/llvm-project/pull/132866

>From 1a8d72da6047e49bdc543013283e02b10fd10b43 Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Tue, 25 Mar 2025 09:52:14 +0800
Subject: [PATCH 1/3] lower vector shuffle to shift

---
 .../LoongArch/LoongArchISelLowering.cpp       | 135 ++++++++-
 .../Target/LoongArch/LoongArchISelLowering.h  |  10 +-
 .../Target/LoongArch/LoongArchLSXInstrInfo.td |  50 ++++
 .../CodeGen/LoongArch/lsx/build-vector.ll     |   7 +-
 .../LoongArch/lsx/vec-shuffle-bit-shift.ll    | 118 ++------
 .../LoongArch/lsx/vec-shuffle-byte-shift.ll   | 276 ++++--------------
 6 files changed, 263 insertions(+), 333 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 900775eedfa7b..772e936ac19e9 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -525,6 +525,121 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
   }
 }
 
+/// Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI
+/// instructions.
+// The function matches elements from one of the input vectors shuffled to the
+// left or right with zeroable elements 'shifted in'. It handles both the
+// strictly bit-wise element shifts and the byte shift across an entire 128-bit
+// lane.
+// This is mainly copied from X86.
+static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
+                               int MaskOffset, const APInt &Zeroable) {
+  int Size = Mask.size();
+  unsigned SizeInBits = Size * ScalarSizeInBits;
+
+  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i < Size; i += Scale)
+      for (int j = 0; j < Shift; ++j)
+        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+          return false;
+
+    return true;
+  };
+
+  auto isSequentialOrUndefInRange = [&](unsigned Pos, unsigned Size, int Low,
+                                        int Step = 1) {
+    for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
+      if (!(Mask[i] == -1 || Mask[i] == Low))
+        return false;
+    return true;
+  };
+
+  auto MatchShift = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i != Size; i += Scale) {
+      unsigned Pos = Left ? i + Shift : i;
+      unsigned Low = Left ? i : i + Shift;
+      unsigned Len = Scale - Shift;
+      if (!isSequentialOrUndefInRange(Pos, Len, Low + MaskOffset))
+        return -1;
+    }
+
+    int ShiftEltBits = ScalarSizeInBits * Scale;
+    bool ByteShift = ShiftEltBits > 64;
+    Opcode = Left ? (ByteShift ? LoongArchISD::VBSLL : LoongArchISD::VSLLI)
+                  : (ByteShift ? LoongArchISD::VBSRL : LoongArchISD::VSRLI);
+    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
+
+    // Normalize the scale for byte shifts to still produce an i64 element
+    // type.
+    Scale = ByteShift ? Scale / 2 : Scale;
+
+    // We need to round trip through the appropriate type for the shift.
+    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
+    return (int)ShiftAmt;
+  };
+
+  unsigned MaxWidth = 128;
+  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
+    for (int Shift = 1; Shift != Scale; ++Shift)
+      for (bool Left : {true, false})
+        if (CheckZeros(Shift, Scale, Left)) {
+          int ShiftAmt = MatchShift(Shift, Scale, Left);
+          if (0 < ShiftAmt)
+            return ShiftAmt;
+        }
+
+  // no match
+  return -1;
+}
+
+/// Lower VECTOR_SHUFFLE as shift (if possible).
+///
+/// For example:
+///   %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+///                      <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+/// is lowered to:
+///     (VBSLL_V $v0, $v0, 4)
+///
+///   %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+///                      <4 x i32> <i32 4, i32 0, i32 4, i32 2>
+/// is lowered to:
+///     (VSLLI_D $v0, $v0, 32)
+static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG,
+                                          const APInt &Zeroable) {
+  int Size = Mask.size();
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  MVT ShiftVT;
+  SDValue V = V1;
+  unsigned Opcode;
+
+  // Try to match shuffle against V1 shift.
+  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                     Mask, 0, Zeroable);
+
+  // If V1 failed, try to match shuffle against V2 shift.
+  if (ShiftAmt < 0) {
+    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                   Mask, Size, Zeroable);
+    V = V2;
+  }
+
+  if (ShiftAmt < 0)
+    return SDValue();
+
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+         "Illegal integer vector type");
+  V = DAG.getBitcast(ShiftVT, V);
+  V = DAG.getNode(Opcode, DL, ShiftVT, V,
+                  DAG.getConstant(ShiftAmt, DL, MVT::i64));
+  return DAG.getBitcast(VT, V);
+}
+
 /// Determine whether a range fits a regular pattern of values.
 /// This function accounts for the possibility of jumping over the End iterator.
 template <typename ValType>
@@ -593,14 +708,12 @@ static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
 static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
                                                     ArrayRef<int> Mask, MVT VT,
                                                     SDValue V1, SDValue V2,
-                                                    SelectionDAG &DAG) {
+                                                    SelectionDAG &DAG,
+                                                    const APInt &Zeroable) {
   int Bits = VT.getSizeInBits();
   int EltBits = VT.getScalarSizeInBits();
   int NumElements = VT.getVectorNumElements();
 
-  APInt KnownUndef, KnownZero;
-  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
-  APInt Zeroable = KnownUndef | KnownZero;
   if (Zeroable.isAllOnes())
     return DAG.getConstant(0, DL, VT);
 
@@ -1062,6 +1175,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
          "Unexpected mask size for shuffle!");
   assert(Mask.size() % 2 == 0 && "Expected even mask size.");
 
+  APInt KnownUndef, KnownZero;
+  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
+  APInt Zeroable = KnownUndef | KnownZero;
+
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
@@ -1089,12 +1206,14 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
     return Result;
+  if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG,
+                                                     Zeroable)))
+    return Result;
   if ((Result =
-           lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
+           lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable)))
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-
   return SDValue();
 }
 
@@ -5041,6 +5160,10 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VANY_NONZERO)
     NODE_NAME_CASE(FRECIPE)
     NODE_NAME_CASE(FRSQRTE)
+    NODE_NAME_CASE(VSLLI)
+    NODE_NAME_CASE(VSRLI)
+    NODE_NAME_CASE(VBSLL)
+    NODE_NAME_CASE(VBSRL)
   }
 #undef NODE_NAME_CASE
   return nullptr;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 002fad0e20759..52d88b9b24a6b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -147,7 +147,15 @@ enum NodeType : unsigned {
 
   // Floating point approximate reciprocal operation
   FRECIPE,
-  FRSQRTE
+  FRSQRTE,
+
+  // Vector logical left / right shift by immediate
+  VSLLI,
+  VSRLI,
+
+  // Vector byte logical left / right shift
+  VBSLL,
+  VBSRL
 
   // Intrinsic operations end =============================================
 };
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index e37de4f545a2a..d22b474692bda 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -58,6 +58,12 @@ def loongarch_vreplgr2vr: SDNode<"LoongArchISD::VREPLGR2VR", SDT_LoongArchVreplg
 def loongarch_vfrecipe: SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchVFRECIPE>;
 def loongarch_vfrsqrte: SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchVFRSQRTE>;
 
+def loongarch_vslli : SDNode<"LoongArchISD::VSLLI", SDT_LoongArchV1RUimm>;
+def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>;
+
+def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>;
+def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>;
+
 def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
 def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
 def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
@@ -1494,15 +1500,59 @@ def : Pat<(or (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
 def : Pat<(xor (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
           (VXORI_B LSX128:$vj, uimm8:$imm)>;
 
+// VBSLL_V
+def : Pat<(loongarch_vbsll v16i8:$vj, uimm5:$imm), (VBSLL_V v16i8:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v8i16:$vj, uimm5:$imm), (VBSLL_V v8i16:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4i32:$vj, uimm5:$imm), (VBSLL_V v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2i64:$vj, uimm5:$imm), (VBSLL_V v2i64:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v4f32:$vj, uimm5:$imm), (VBSLL_V v4f32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsll v2f64:$vj, uimm5:$imm), (VBSLL_V v2f64:$vj,
+                                                       uimm5:$imm)>;
+
+// VBSRL_V
+def : Pat<(loongarch_vbsrl v16i8:$vj, uimm5:$imm), (VBSRL_V v16i8:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v8i16:$vj, uimm5:$imm), (VBSRL_V v8i16:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4i32:$vj, uimm5:$imm), (VBSRL_V v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2i64:$vj, uimm5:$imm), (VBSRL_V v2i64:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v4f32:$vj, uimm5:$imm), (VBSRL_V v4f32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vbsrl v2f64:$vj, uimm5:$imm), (VBSRL_V v2f64:$vj,
+                                                       uimm5:$imm)>;
+
 // VSLL[I]_{B/H/W/D}
 defm : PatVrVr<shl, "VSLL">;
 defm : PatShiftVrVr<shl, "VSLL">;
 defm : PatShiftVrUimm<shl, "VSLLI">;
+def : Pat<(loongarch_vslli v16i8:$vj, uimm3:$imm), (VSLLI_B v16i8:$vj,
+                                                       uimm3:$imm)>;
+def : Pat<(loongarch_vslli v8i16:$vj, uimm4:$imm), (VSLLI_H v8i16:$vj,
+                                                       uimm4:$imm)>;
+def : Pat<(loongarch_vslli v4i32:$vj, uimm5:$imm), (VSLLI_W v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vslli v2i64:$vj, uimm6:$imm), (VSLLI_D v2i64:$vj,
+                                                       uimm6:$imm)>;
 
 // VSRL[I]_{B/H/W/D}
 defm : PatVrVr<srl, "VSRL">;
 defm : PatShiftVrVr<srl, "VSRL">;
 defm : PatShiftVrUimm<srl, "VSRLI">;
+def : Pat<(loongarch_vsrli v16i8:$vj, uimm3:$imm), (VSRLI_B v16i8:$vj,
+                                                       uimm3:$imm)>;
+def : Pat<(loongarch_vsrli v8i16:$vj, uimm4:$imm), (VSRLI_H v8i16:$vj,
+                                                       uimm4:$imm)>;
+def : Pat<(loongarch_vsrli v4i32:$vj, uimm5:$imm), (VSRLI_W v4i32:$vj,
+                                                       uimm5:$imm)>;
+def : Pat<(loongarch_vsrli v2i64:$vj, uimm6:$imm), (VSRLI_D v2i64:$vj,
+                                                       uimm6:$imm)>;
 
 // VSRA[I]_{B/H/W/D}
 defm : PatVrVr<sra, "VSRA">;
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index 984b6f3d74866..d84e408cd28be 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -374,11 +374,8 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI24_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI24_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 32
+; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i32>, ptr %src
   %e = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
index b590103511847..48f18a35a38c4 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-bit-shift.ll
@@ -4,10 +4,7 @@
 define <16 x i8> @shuffle_to_vslli_h_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_h_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
   ret <16 x i8> %shuffle
@@ -16,10 +13,7 @@ define <16 x i8> @shuffle_to_vslli_h_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_h_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_h_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.h $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
   ret <16 x i8> %shuffle
@@ -28,10 +22,7 @@ define <16 x i8> @shuffle_to_vsrli_h_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_w_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_w_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 1, i32 2, i32 16, i32 4, i32 5, i32 6, i32 16, i32 8, i32 9, i32 10, i32 16, i32 12, i32 13, i32 14>
   ret <16 x i8> %shuffle
@@ -40,10 +31,7 @@ define <16 x i8> @shuffle_to_vslli_w_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_w_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_w_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 16, i32 13, i32 14, i32 15, i32 16>
   ret <16 x i8> %shuffle
@@ -52,11 +40,7 @@ define <16 x i8> @shuffle_to_vsrli_w_8(<16 x i8> %a) nounwind {
 define <8 x i16> @shuffle_to_vslli_w_16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_w_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
   ret <8 x i16> %shuffle
@@ -65,11 +49,7 @@ define <8 x i16> @shuffle_to_vslli_w_16(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_to_vsrli_w_16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_w_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 16
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 5, i32 8, i32 7, i32 8>
   ret <8 x i16> %shuffle
@@ -78,10 +58,7 @@ define <8 x i16> @shuffle_to_vsrli_w_16(<8 x i16> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_w_24(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_w_24:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
   ret <16 x i8> %shuffle
@@ -90,10 +67,7 @@ define <16 x i8> @shuffle_to_vslli_w_24(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_w_24(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_w_24:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.w $vr0, $vr0, 24
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 3, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -102,10 +76,7 @@ define <16 x i8> @shuffle_to_vsrli_w_24(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_d_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <16 x i8> %shuffle
@@ -114,10 +85,7 @@ define <16 x i8> @shuffle_to_vslli_d_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_d_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
   ret <16 x i8> %shuffle
@@ -126,11 +94,7 @@ define <16 x i8> @shuffle_to_vsrli_d_8(<16 x i8> %a) nounwind {
 define <8 x i16> @shuffle_to_vslli_d_16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 16
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 6>
   ret <8 x i16> %shuffle
@@ -139,11 +103,7 @@ define <8 x i16> @shuffle_to_vslli_d_16(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_to_vsrli_d_16(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 16
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 8>
   ret <8 x i16> %shuffle
@@ -152,10 +112,7 @@ define <8 x i16> @shuffle_to_vsrli_d_16(<8 x i16> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_d_24(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_24:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI12_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI12_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 24
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 16, i32 16, i32 8, i32 9, i32 10, i32 11, i32 12>
   ret <16 x i8> %shuffle
@@ -164,10 +121,7 @@ define <16 x i8> @shuffle_to_vslli_d_24(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_d_24(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_24:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI13_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI13_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 24
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -176,11 +130,7 @@ define <16 x i8> @shuffle_to_vsrli_d_24(<16 x i8> %a) nounwind {
 define <4 x i32> @shuffle_to_vslli_d_32(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI14_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI14_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 poison>
   ret <4 x i32> %shuffle
@@ -189,11 +139,7 @@ define <4 x i32> @shuffle_to_vslli_d_32(<4 x i32> %a) nounwind {
 define <4 x i32> @shuffle_to_vsrli_d_32(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI15_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI15_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 32
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
   ret <4 x i32> %shuffle
@@ -202,10 +148,7 @@ define <4 x i32> @shuffle_to_vsrli_d_32(<4 x i32> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_d_40(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_40:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI16_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI16_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 40
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8, i32 9, i32 10>
   ret <16 x i8> %shuffle
@@ -214,10 +157,7 @@ define <16 x i8> @shuffle_to_vslli_d_40(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_d_40(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_40:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI17_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI17_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 40
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -226,11 +166,7 @@ define <16 x i8> @shuffle_to_vsrli_d_40(<16 x i8> %a) nounwind {
 define <8 x i16> @shuffle_to_vslli_d_48(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_48:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI18_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI18_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 48
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 8, i32 0, i32 8, i32 8, i32 8, i32 4>
   ret <8 x i16> %shuffle
@@ -239,11 +175,7 @@ define <8 x i16> @shuffle_to_vslli_d_48(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_to_vsrli_d_48(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_48:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI19_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI19_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 48
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 3, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
   ret <8 x i16> %shuffle
@@ -252,10 +184,7 @@ define <8 x i16> @shuffle_to_vsrli_d_48(<8 x i16> %a) nounwind {
 define <16 x i8> @shuffle_to_vslli_d_56(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vslli_d_56:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI20_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI20_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
   ret <16 x i8> %shuffle
@@ -264,10 +193,7 @@ define <16 x i8> @shuffle_to_vslli_d_56(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_to_vsrli_d_56(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_to_vsrli_d_56:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI21_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI21_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vsrli.d $vr0, $vr0, 56
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll
index 8156239f81963..720fe919601e6 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll
@@ -4,10 +4,7 @@
 define <16 x i8> @shuffle_16i8_vbsll_v_1(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 1
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <16 x i8> %shuffle
@@ -16,10 +13,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_1(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_2(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 2
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
   ret <16 x i8> %shuffle
@@ -28,10 +22,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_2(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_3(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 3
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
   ret <16 x i8> %shuffle
@@ -40,10 +31,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_3(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_4(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 4
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <16 x i8> %shuffle
@@ -52,10 +40,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_4(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_5(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 5
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
   ret <16 x i8> %shuffle
@@ -64,10 +49,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_5(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_6(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 6
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
   ret <16 x i8> %shuffle
@@ -76,10 +58,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_6(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_7(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 7
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   ret <16 x i8> %shuffle
@@ -88,10 +67,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_7(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <16 x i8> %shuffle
@@ -100,10 +76,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_9(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 9
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
   ret <16 x i8> %shuffle
@@ -112,10 +85,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_9(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_10(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 10
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
   ret <16 x i8> %shuffle
@@ -124,10 +94,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_10(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_11(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_11:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 11
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4>
   ret <16 x i8> %shuffle
@@ -136,10 +103,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_11(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_12(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 12
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3>
   ret <16 x i8> %shuffle
@@ -148,10 +112,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_12(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_13(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_13:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI12_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI12_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 13
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2>
   ret <16 x i8> %shuffle
@@ -160,10 +121,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_13(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_14(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_14:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI13_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI13_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 14
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1>
   ret <16 x i8> %shuffle
@@ -172,10 +130,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_14(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsll_v_15(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsll_v_15:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI14_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI14_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 15
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0>
   ret <16 x i8> %shuffle
@@ -184,11 +139,7 @@ define <16 x i8> @shuffle_16i8_vbsll_v_15(<16 x i8> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsll_v_2(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsll_v_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI15_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI15_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 2
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
   ret <8 x i16> %shuffle
@@ -197,11 +148,7 @@ define <8 x i16> @shuffle_8i16_vbsll_v_2(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsll_v_4(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsll_v_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI16_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI16_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 4
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
   ret <8 x i16> %shuffle
@@ -210,11 +157,7 @@ define <8 x i16> @shuffle_8i16_vbsll_v_4(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsll_v_6(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsll_v_6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI17_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI17_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 6
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4>
   ret <8 x i16> %shuffle
@@ -223,11 +166,7 @@ define <8 x i16> @shuffle_8i16_vbsll_v_6(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsll_v_8(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsll_v_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI18_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI18_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
   ret <8 x i16> %shuffle
@@ -236,11 +175,7 @@ define <8 x i16> @shuffle_8i16_vbsll_v_8(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsll_v_10(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsll_v_10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI19_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI19_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 10
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2>
   ret <8 x i16> %shuffle
@@ -249,11 +184,7 @@ define <8 x i16> @shuffle_8i16_vbsll_v_10(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsll_v_12(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsll_v_12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI20_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI20_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 12
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 1>
   ret <8 x i16> %shuffle
@@ -262,11 +193,7 @@ define <8 x i16> @shuffle_8i16_vbsll_v_12(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsll_v_14(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsll_v_14:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI21_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI21_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 14
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0>
   ret <8 x i16> %shuffle
@@ -275,11 +202,7 @@ define <8 x i16> @shuffle_8i16_vbsll_v_14(<8 x i16> %a) nounwind {
 define <4 x i32> @shuffle_4i32_vbsll_v_4(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_4i32_vbsll_v_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI22_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI22_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 4
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
   ret <4 x i32> %shuffle
@@ -288,11 +211,7 @@ define <4 x i32> @shuffle_4i32_vbsll_v_4(<4 x i32> %a) nounwind {
 define <4 x i32> @shuffle_4i32_vbsll_v_8(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_4i32_vbsll_v_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI23_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI23_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 4, i32 0, i32 1>
   ret <4 x i32> %shuffle
@@ -301,11 +220,7 @@ define <4 x i32> @shuffle_4i32_vbsll_v_8(<4 x i32> %a) nounwind {
 define <4 x i32> @shuffle_4i32_vbsll_v_12(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_4i32_vbsll_v_12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI24_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI24_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsll.v $vr0, $vr0, 12
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 4, i32 4, i32 0>
   ret <4 x i32> %shuffle
@@ -324,10 +239,7 @@ define <2 x i64> @shuffle_2i64_vbsll_v_8(<2 x i64> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_1(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI26_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI26_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 1
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
   ret <16 x i8> %shuffle
@@ -336,10 +248,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_1(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_2(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI27_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI27_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 2
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -348,10 +257,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_2(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_3(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI28_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI28_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 3
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -360,10 +266,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_3(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_4(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI29_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI29_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 4
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -372,10 +275,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_4(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_5(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI30_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI30_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 5
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -384,10 +284,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_5(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_6(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI31_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI31_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 6
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -396,10 +293,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_6(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_7(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI32_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI32_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 7
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -408,10 +302,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_7(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_8(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI33_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI33_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -420,10 +311,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_8(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_9(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI34_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI34_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 9
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -432,10 +320,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_9(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_10(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI35_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI35_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 10
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -444,10 +329,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_10(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_11(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_11:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI36_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI36_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 11
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -456,10 +338,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_11(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_12(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI37_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI37_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 12
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -468,10 +347,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_12(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_13(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_13:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI38_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI38_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 13
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -480,10 +356,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_13(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_14(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_14:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI39_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI39_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 14
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -492,10 +365,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_14(<16 x i8> %a) nounwind {
 define <16 x i8> @shuffle_16i8_vbsrl_v_15(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: shuffle_16i8_vbsrl_v_15:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI40_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI40_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr2, $vr0, $vr1
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 15
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
@@ -504,11 +374,7 @@ define <16 x i8> @shuffle_16i8_vbsrl_v_15(<16 x i8> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsrl_v_2(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsrl_v_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI41_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI41_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 2
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   ret <8 x i16> %shuffle
@@ -517,11 +383,7 @@ define <8 x i16> @shuffle_8i16_vbsrl_v_2(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsrl_v_4(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsrl_v_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI42_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI42_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 4
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8>
   ret <8 x i16> %shuffle
@@ -530,11 +392,7 @@ define <8 x i16> @shuffle_8i16_vbsrl_v_4(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsrl_v_6(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsrl_v_6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI43_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI43_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 6
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8>
   ret <8 x i16> %shuffle
@@ -543,11 +401,7 @@ define <8 x i16> @shuffle_8i16_vbsrl_v_6(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsrl_v_8(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsrl_v_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI44_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI44_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8>
   ret <8 x i16> %shuffle
@@ -556,11 +410,7 @@ define <8 x i16> @shuffle_8i16_vbsrl_v_8(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsrl_v_10(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsrl_v_10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI45_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI45_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 10
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <8 x i16> %shuffle
@@ -569,11 +419,7 @@ define <8 x i16> @shuffle_8i16_vbsrl_v_10(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsrl_v_12(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsrl_v_12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI46_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI46_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 12
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <8 x i16> %shuffle
@@ -582,11 +428,7 @@ define <8 x i16> @shuffle_8i16_vbsrl_v_12(<8 x i16> %a) nounwind {
 define <8 x i16> @shuffle_8i16_vbsrl_v_14(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: shuffle_8i16_vbsrl_v_14:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI47_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI47_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 14
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <8 x i16> %shuffle
@@ -595,11 +437,7 @@ define <8 x i16> @shuffle_8i16_vbsrl_v_14(<8 x i16> %a) nounwind {
 define <4 x i32> @shuffle_4i32_vbsrl_v_4(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_4i32_vbsrl_v_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI48_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI48_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 4
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   ret <4 x i32> %shuffle
@@ -608,11 +446,7 @@ define <4 x i32> @shuffle_4i32_vbsrl_v_4(<4 x i32> %a) nounwind {
 define <4 x i32> @shuffle_4i32_vbsrl_v_8(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_4i32_vbsrl_v_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI49_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI49_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
   ret <4 x i32> %shuffle
@@ -621,11 +455,7 @@ define <4 x i32> @shuffle_4i32_vbsrl_v_8(<4 x i32> %a) nounwind {
 define <4 x i32> @shuffle_4i32_vbsrl_v_12(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: shuffle_4i32_vbsrl_v_12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI50_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI50_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 12
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
   ret <4 x i32> %shuffle
@@ -634,11 +464,7 @@ define <4 x i32> @shuffle_4i32_vbsrl_v_12(<4 x i32> %a) nounwind {
 define <2 x i64> @shuffle_2i64_vbsrl_v_8(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: shuffle_2i64_vbsrl_v_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI51_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI51_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.d $vr1, $vr2, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr1, 0
+; CHECK-NEXT:    vbsrl.v $vr0, $vr0, 8
 ; CHECK-NEXT:    ret
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %shuffle

>From 0567c306aba84c439451f8a7b6425d862c1ffbca Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Tue, 25 Mar 2025 10:41:58 +0800
Subject: [PATCH 2/3] small change

---
 llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 772e936ac19e9..255f9e1796ecf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -531,7 +531,7 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
 // left or right with zeroable elements 'shifted in'. It handles both the
 // strictly bit-wise element shifts and the byte shfit across an entire 128-bit
 // lane.
-// This is mainly copy from X86.
+// Mostly copied from X86.
 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                unsigned ScalarSizeInBits, ArrayRef<int> Mask,
                                int MaskOffset, const APInt &Zeroable) {

>From ded9f43c157c1807a0b35209078152d6fe18caff Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Wed, 2 Apr 2025 17:04:55 +0800
Subject: [PATCH 3/3] refactor td code

---
 .../LoongArch/LoongArchISelLowering.cpp       |  2 +-
 .../Target/LoongArch/LoongArchLSXInstrInfo.td | 67 +++++++------------
 2 files changed, 24 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 255f9e1796ecf..89bcbd394c107 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -527,7 +527,7 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
 
 /// Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI
 /// instruction.
-// The funciton matches elements form one of the input vector shuffled to the
+// The function matches elements from one of the input vectors shuffled to the
 // left or right with zeroable elements 'shifted in'. It handles both the
 // strictly bit-wise element shifts and the byte shfit across an entire 128-bit
 // lane.
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d22b474692bda..b0d880749bf92 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1352,7 +1352,7 @@ multiclass PatShiftVrVr<SDPatternOperator OpNode, string Inst> {
             (!cast<LAInst>(Inst#"_D") LSX128:$vj, LSX128:$vk)>;
 }
 
-multiclass PatShiftVrUimm<SDPatternOperator OpNode, string Inst> {
+multiclass PatShiftVrSplatUimm<SDPatternOperator OpNode, string Inst> {
   def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm3 uimm3:$imm))),
             (!cast<LAInst>(Inst#"_B") LSX128:$vj, uimm3:$imm)>;
   def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 (SplatPat_uimm4 uimm4:$imm))),
@@ -1363,6 +1363,17 @@ multiclass PatShiftVrUimm<SDPatternOperator OpNode, string Inst> {
             (!cast<LAInst>(Inst#"_D") LSX128:$vj, uimm6:$imm)>;
 }
 
+multiclass PatShiftVrUimm<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode(v16i8 LSX128:$vj), uimm3:$imm),
+            (!cast<LAInst>(Inst#"_B") LSX128:$vj, uimm3:$imm)>;
+  def : Pat<(OpNode(v8i16 LSX128:$vj), uimm4:$imm),
+            (!cast<LAInst>(Inst#"_H") LSX128:$vj, uimm4:$imm)>;
+  def : Pat<(OpNode(v4i32 LSX128:$vj), uimm5:$imm),
+            (!cast<LAInst>(Inst#"_W") LSX128:$vj, uimm5:$imm)>;
+  def : Pat<(OpNode(v2i64 LSX128:$vj), uimm6:$imm),
+            (!cast<LAInst>(Inst#"_D") LSX128:$vj, uimm6:$imm)>;
+}
+
 multiclass PatCCVrSimm5<CondCode CC, string Inst> {
   def : Pat<(v16i8 (setcc (v16i8 LSX128:$vj),
                           (v16i8 (SplatPat_simm5 simm5:$imm)), CC)),
@@ -1501,63 +1512,31 @@ def : Pat<(xor (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))),
           (VXORI_B LSX128:$vj, uimm8:$imm)>;
 
 // VBSLL_V
-def : Pat<(loongarch_vbsll v16i8:$vj, uimm5:$imm), (VBSLL_V v16i8:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsll v8i16:$vj, uimm5:$imm), (VBSLL_V v8i16:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsll v4i32:$vj, uimm5:$imm), (VBSLL_V v4i32:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsll v2i64:$vj, uimm5:$imm), (VBSLL_V v2i64:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsll v4f32:$vj, uimm5:$imm), (VBSLL_V v4f32:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsll v2f64:$vj, uimm5:$imm), (VBSLL_V v2f64:$vj,
-                                                       uimm5:$imm)>;
+foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32,
+              v2f64] in def : Pat<(loongarch_vbsll(vt LSX128:$vj), uimm5:$imm),
+                                  (VBSLL_V LSX128:$vj, uimm5:$imm)>;
 
 // VBSRL_V
-def : Pat<(loongarch_vbsrl v16i8:$vj, uimm5:$imm), (VBSRL_V v16i8:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsrl v8i16:$vj, uimm5:$imm), (VBSRL_V v8i16:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsrl v4i32:$vj, uimm5:$imm), (VBSRL_V v4i32:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsrl v2i64:$vj, uimm5:$imm), (VBSRL_V v2i64:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsrl v4f32:$vj, uimm5:$imm), (VBSRL_V v4f32:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vbsrl v2f64:$vj, uimm5:$imm), (VBSRL_V v2f64:$vj,
-                                                       uimm5:$imm)>;
+foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32,
+              v2f64] in def : Pat<(loongarch_vbsrl(vt LSX128:$vj), uimm5:$imm),
+                                  (VBSRL_V LSX128:$vj, uimm5:$imm)>;
 
 // VSLL[I]_{B/H/W/D}
 defm : PatVrVr<shl, "VSLL">;
 defm : PatShiftVrVr<shl, "VSLL">;
-defm : PatShiftVrUimm<shl, "VSLLI">;
-def : Pat<(loongarch_vslli v16i8:$vj, uimm3:$imm), (VSLLI_B v16i8:$vj,
-                                                       uimm3:$imm)>;
-def : Pat<(loongarch_vslli v8i16:$vj, uimm4:$imm), (VSLLI_H v8i16:$vj,
-                                                       uimm4:$imm)>;
-def : Pat<(loongarch_vslli v4i32:$vj, uimm5:$imm), (VSLLI_W v4i32:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vslli v2i64:$vj, uimm6:$imm), (VSLLI_D v2i64:$vj,
-                                                       uimm6:$imm)>;
+defm : PatShiftVrSplatUimm<shl, "VSLLI">;
+defm : PatShiftVrUimm<loongarch_vslli, "VSLLI">;
 
 // VSRL[I]_{B/H/W/D}
 defm : PatVrVr<srl, "VSRL">;
 defm : PatShiftVrVr<srl, "VSRL">;
-defm : PatShiftVrUimm<srl, "VSRLI">;
-def : Pat<(loongarch_vsrli v16i8:$vj, uimm3:$imm), (VSRLI_B v16i8:$vj,
-                                                       uimm3:$imm)>;
-def : Pat<(loongarch_vsrli v8i16:$vj, uimm4:$imm), (VSRLI_H v8i16:$vj,
-                                                       uimm4:$imm)>;
-def : Pat<(loongarch_vsrli v4i32:$vj, uimm5:$imm), (VSRLI_W v4i32:$vj,
-                                                       uimm5:$imm)>;
-def : Pat<(loongarch_vsrli v2i64:$vj, uimm6:$imm), (VSRLI_D v2i64:$vj,
-                                                       uimm6:$imm)>;
+defm : PatShiftVrSplatUimm<srl, "VSRLI">;
+defm : PatShiftVrUimm<loongarch_vsrli, "VSRLI">;
 
 // VSRA[I]_{B/H/W/D}
 defm : PatVrVr<sra, "VSRA">;
 defm : PatShiftVrVr<sra, "VSRA">;
-defm : PatShiftVrUimm<sra, "VSRAI">;
+defm : PatShiftVrSplatUimm<sra, "VSRAI">;
 
 // VCLZ_{B/H/W/D}
 defm : PatVr<ctlz, "VCLZ">;



More information about the llvm-commits mailing list