[llvm] e3bd1f2 - [LoongArch] lower vector shuffle to zero or any extend (#129485)

via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 23 23:29:14 PDT 2025


Author: tangaac
Date: 2025-03-24T14:29:11+08:00
New Revision: e3bd1f2b3f2ff8317e72fc828db8bcdbeb50b1aa

URL: https://github.com/llvm/llvm-project/commit/e3bd1f2b3f2ff8317e72fc828db8bcdbeb50b1aa
DIFF: https://github.com/llvm/llvm-project/commit/e3bd1f2b3f2ff8317e72fc828db8bcdbeb50b1aa.diff

LOG: [LoongArch] lower vector shuffle to zero or any extend (#129485)

Added: 
    

Modified: 
    llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
    llvm/lib/Target/LoongArch/LoongArchISelLowering.h
    llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
    llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
    llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
    llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
    llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
    llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index dceb3c682d2df..f69019bf34f37 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -542,6 +542,132 @@ fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
   return true;
 }
 
+/// Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, APInt &KnownUndef,
+                                           APInt &KnownZero) {
+  int Size = Mask.size();
+  KnownUndef = KnownZero = APInt::getZero(Size);
+
+  V1 = peekThroughBitcasts(V1);
+  V2 = peekThroughBitcasts(V2);
+
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  int VectorSizeInBits = V1.getValueSizeInBits();
+  int ScalarSizeInBits = VectorSizeInBits / Size;
+  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+  for (int i = 0; i < Size; ++i) {
+    int M = Mask[i];
+    if (M < 0) {
+      KnownUndef.setBit(i);
+      continue;
+    }
+    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+      KnownZero.setBit(i);
+      continue;
+    }
+  }
+}
+
+/// Lower VECTOR_SHUFFLE as ZERO_EXTEND Or ANY_EXTEND (if possible).
+///
+/// For example:
+///   %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+///                      <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+///   %3 = bitcast <4 x i32> %2 to <2 x i64>
+/// is lowered to:
+///     (VREPLI $v1, 0)
+///     (VILVL $v0, $v1, $v0)
+static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
+                                                    ArrayRef<int> Mask, MVT VT,
+                                                    SDValue V1, SDValue V2,
+                                                    SelectionDAG &DAG) {
+  int Bits = VT.getSizeInBits();
+  int EltBits = VT.getScalarSizeInBits();
+  int NumElements = VT.getVectorNumElements();
+
+  APInt KnownUndef, KnownZero;
+  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
+  APInt Zeroable = KnownUndef | KnownZero;
+  if (Zeroable.isAllOnes())
+    return DAG.getConstant(0, DL, VT);
+
+  // Define a helper function to check a particular ext-scale and lower to it if
+  // valid.
+  auto Lower = [&](int Scale) -> SDValue {
+    SDValue InputV;
+    bool AnyExt = true;
+    int Offset = 0;
+    for (int i = 0; i < NumElements; i++) {
+      int M = Mask[i];
+      if (M < 0)
+        continue;
+      if (i % Scale != 0) {
+        // Each of the extended elements need to be zeroable.
+        if (!Zeroable[i])
+          return SDValue();
+
+        AnyExt = false;
+        continue;
+      }
+
+      // Each of the base elements needs to be consecutive indices into the
+      // same input vector.
+      SDValue V = M < NumElements ? V1 : V2;
+      M = M % NumElements;
+      if (!InputV) {
+        InputV = V;
+        Offset = M - (i / Scale);
+
+        // These offset can't be handled
+        if (Offset % (NumElements / Scale))
+          return SDValue();
+      } else if (InputV != V)
+        return SDValue();
+
+      if (M != (Offset + (i / Scale)))
+        return SDValue(); // Non-consecutive strided elements.
+    }
+
+    // If we fail to find an input, we have a zero-shuffle which should always
+    // have already been handled.
+    if (!InputV)
+      return SDValue();
+
+    do {
+      unsigned VilVLoHi = LoongArchISD::VILVL;
+      if (Offset >= (NumElements / 2)) {
+        VilVLoHi = LoongArchISD::VILVH;
+        Offset -= (NumElements / 2);
+      }
+
+      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+      SDValue Ext =
+          AnyExt ? DAG.getFreeze(InputV) : DAG.getConstant(0, DL, InputVT);
+      InputV = DAG.getBitcast(InputVT, InputV);
+      InputV = DAG.getNode(VilVLoHi, DL, InputVT, Ext, InputV);
+      Scale /= 2;
+      EltBits *= 2;
+      NumElements /= 2;
+    } while (Scale > 1);
+    return DAG.getBitcast(VT, InputV);
+  };
+
+  // Each iteration, try extending the elements half as much, but into twice as
+  // many elements.
+  for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
+       NumExtElements *= 2) {
+    if (SDValue V = Lower(NumElements / NumExtElements))
+      return V;
+  }
+  return SDValue();
+}
+
 /// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
 ///
 /// VREPLVEI performs vector broadcast based on an element specified by an
@@ -956,6 +1082,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
     return Result;
+  if ((Result =
+           lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
   if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
     return Result;
 

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f8d4cef76b955..002fad0e20759 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -271,7 +271,14 @@ class LoongArchTargetLowering : public TargetLowering {
       unsigned *Fast = nullptr) const override;
 
   bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override {
-    return false;
+    if (!VT.isSimple())
+      return false;
+
+    // Not for i1 vectors
+    if (VT.getSimpleVT().getScalarType() == MVT::i1)
+      return false;
+
+    return isTypeLegal(VT.getSimpleVT());
   }
   bool shouldConsiderGEPOffsetSplit() const override { return true; }
   bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override;

diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index eaab6524c5317..984b6f3d74866 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -374,10 +374,11 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
 ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
-; CHECK-NEXT:    vst $vr0, $a1, 0
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI24_0)
+; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI24_0)
+; CHECK-NEXT:    vrepli.b $vr2, 0
+; CHECK-NEXT:    vshuf.w $vr1, $vr2, $vr0
+; CHECK-NEXT:    vst $vr1, $a1, 0
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i32>, ptr %src
   %e = extractelement <4 x i32> %v, i32 1

diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
index 75639ae090661..9485df746ff1c 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
@@ -6,10 +6,10 @@ define void @load_sext_2i8_to_2i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_2i8_to_2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.h $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
 ; CHECK-NEXT:    vsrai.d $vr0, $vr0, 56
 ; CHECK-NEXT:    vst $vr0, $a1, 0
@@ -25,10 +25,9 @@ define void @load_sext_4i8_to_4i32(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_4i8_to_4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
 ; CHECK-NEXT:    vsrai.w $vr0, $vr0, 24
 ; CHECK-NEXT:    vst $vr0, $a1, 0
@@ -61,10 +60,9 @@ define void @load_sext_2i16_to_2i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_2i16_to_2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vshuf.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 48
 ; CHECK-NEXT:    vsrai.d $vr0, $vr0, 48
 ; CHECK-NEXT:    vst $vr0, $a1, 0
@@ -114,17 +112,14 @@ define void @load_sext_16i8_to_16i16(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_16i8_to_16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT:    vilvl.b $vr1, $vr1, $vr1
+; CHECK-NEXT:    vilvl.b $vr1, $vr0, $vr0
 ; CHECK-NEXT:    vslli.h $vr1, $vr1, 8
 ; CHECK-NEXT:    vsrai.h $vr1, $vr1, 8
-; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvh.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
 ; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vst $vr0, $a1, 16
+; CHECK-NEXT:    vst $vr1, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <16 x i8>, ptr %ptr
@@ -137,33 +132,24 @@ define void @load_sext_16i8_to_16i32(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_16i8_to_16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_2)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI7_2)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr1, $vr2
+; CHECK-NEXT:    vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT:    vslli.w $vr2, $vr2, 24
+; CHECK-NEXT:    vsrai.w $vr2, $vr2, 24
+; CHECK-NEXT:    vilvh.h $vr1, $vr1, $vr1
 ; CHECK-NEXT:    vslli.w $vr1, $vr1, 24
 ; CHECK-NEXT:    vsrai.w $vr1, $vr1, 24
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_3)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI7_3)
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr3, $vr2
+; CHECK-NEXT:    vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr3, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr3, $vr3, 24
 ; CHECK-NEXT:    vsrai.w $vr3, $vr3, 24
-; CHECK-NEXT:    vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT:    vshuf.b $vr4, $vr0, $vr4, $vr2
-; CHECK-NEXT:    vslli.w $vr4, $vr4, 24
-; CHECK-NEXT:    vsrai.w $vr4, $vr4, 24
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr2
+; CHECK-NEXT:    vilvh.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
 ; CHECK-NEXT:    vsrai.w $vr0, $vr0, 24
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr4, $a1, 48
+; CHECK-NEXT:    vst $vr0, $a1, 48
 ; CHECK-NEXT:    vst $vr3, $a1, 32
 ; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <16 x i8>, ptr %ptr
@@ -176,59 +162,44 @@ define void @load_sext_16i8_to_16i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_16i8_to_16i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vshuf4i.b $vr2, $vr0, 14
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT:    vshuf.b $vr2, $vr0, $vr2, $vr1
-; CHECK-NEXT:    vslli.d $vr2, $vr2, 56
-; CHECK-NEXT:    vsrai.d $vr2, $vr2, 56
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr3, $vr1
+; CHECK-NEXT:    vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT:    vilvl.w $vr3, $vr2, $vr2
 ; CHECK-NEXT:    vslli.d $vr3, $vr3, 56
 ; CHECK-NEXT:    vsrai.d $vr3, $vr3, 56
-; CHECK-NEXT:    vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT:    vshuf.b $vr4, $vr0, $vr4, $vr1
+; CHECK-NEXT:    vilvh.w $vr2, $vr2, $vr2
+; CHECK-NEXT:    vslli.d $vr2, $vr2, 56
+; CHECK-NEXT:    vsrai.d $vr2, $vr2, 56
+; CHECK-NEXT:    vilvh.h $vr1, $vr1, $vr1
+; CHECK-NEXT:    vilvl.w $vr4, $vr1, $vr1
 ; CHECK-NEXT:    vslli.d $vr4, $vr4, 56
 ; CHECK-NEXT:    vsrai.d $vr4, $vr4, 56
-; CHECK-NEXT:    vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_4)
-; CHECK-NEXT:    vld $vr6, $a0, %pc_lo12(.LCPI8_4)
-; CHECK-NEXT:    vshuf.b $vr5, $vr0, $vr5, $vr1
-; CHECK-NEXT:    vslli.d $vr5, $vr5, 56
-; CHECK-NEXT:    vsrai.d $vr5, $vr5, 56
-; CHECK-NEXT:    vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_5)
-; CHECK-NEXT:    vld $vr7, $a0, %pc_lo12(.LCPI8_5)
-; CHECK-NEXT:    vshuf.b $vr6, $vr0, $vr6, $vr1
+; CHECK-NEXT:    vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT:    vslli.d $vr1, $vr1, 56
+; CHECK-NEXT:    vsrai.d $vr1, $vr1, 56
+; CHECK-NEXT:    vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr5, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr6, $vr5, $vr5
 ; CHECK-NEXT:    vslli.d $vr6, $vr6, 56
 ; CHECK-NEXT:    vsrai.d $vr6, $vr6, 56
-; CHECK-NEXT:    vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_6)
-; CHECK-NEXT:    vld $vr8, $a0, %pc_lo12(.LCPI8_6)
-; CHECK-NEXT:    vshuf.b $vr7, $vr0, $vr7, $vr1
+; CHECK-NEXT:    vilvh.w $vr5, $vr5, $vr5
+; CHECK-NEXT:    vslli.d $vr5, $vr5, 56
+; CHECK-NEXT:    vsrai.d $vr5, $vr5, 56
+; CHECK-NEXT:    vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr7, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr7, $vr7, 56
 ; CHECK-NEXT:    vsrai.d $vr7, $vr7, 56
-; CHECK-NEXT:    vshuf.b $vr8, $vr0, $vr0, $vr8
-; CHECK-NEXT:    vshuf.b $vr8, $vr0, $vr8, $vr1
-; CHECK-NEXT:    vslli.d $vr8, $vr8, 56
-; CHECK-NEXT:    vsrai.d $vr8, $vr8, 56
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vilvh.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
 ; CHECK-NEXT:    vsrai.d $vr0, $vr0, 56
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr8, $a1, 112
+; CHECK-NEXT:    vst $vr0, $a1, 112
 ; CHECK-NEXT:    vst $vr7, $a1, 96
-; CHECK-NEXT:    vst $vr6, $a1, 80
-; CHECK-NEXT:    vst $vr5, $a1, 64
-; CHECK-NEXT:    vst $vr4, $a1, 48
-; CHECK-NEXT:    vst $vr3, $a1, 32
+; CHECK-NEXT:    vst $vr5, $a1, 80
+; CHECK-NEXT:    vst $vr6, $a1, 64
+; CHECK-NEXT:    vst $vr1, $a1, 48
+; CHECK-NEXT:    vst $vr4, $a1, 32
 ; CHECK-NEXT:    vst $vr2, $a1, 16
+; CHECK-NEXT:    vst $vr3, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <16 x i8>, ptr %ptr
@@ -241,17 +212,14 @@ define void @load_sext_8i16_to_8i32(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_8i16_to_8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vilvl.h $vr1, $vr1, $vr1
+; CHECK-NEXT:    vilvl.h $vr1, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr1, $vr1, 16
 ; CHECK-NEXT:    vsrai.w $vr1, $vr1, 16
-; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvh.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vst $vr0, $a1, 16
+; CHECK-NEXT:    vst $vr1, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <8 x i16>, ptr %ptr
@@ -264,34 +232,24 @@ define void @load_sext_8i16_to_8i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_8i16_to_8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    vshuf4i.h $vr2, $vr0, 14
-; CHECK-NEXT:    vori.b $vr3, $vr1, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT:    vshuf.h $vr3, $vr0, $vr2
-; CHECK-NEXT:    vslli.d $vr2, $vr3, 48
+; CHECK-NEXT:    vilvl.h $vr1, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr2, $vr1, $vr1
+; CHECK-NEXT:    vslli.d $vr2, $vr2, 48
 ; CHECK-NEXT:    vsrai.d $vr2, $vr2, 48
-; CHECK-NEXT:    vshuf.h $vr4, $vr0, $vr0
-; CHECK-NEXT:    vori.b $vr3, $vr1, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT:    vshuf.h $vr3, $vr0, $vr4
+; CHECK-NEXT:    vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT:    vslli.d $vr1, $vr1, 48
+; CHECK-NEXT:    vsrai.d $vr1, $vr1, 48
+; CHECK-NEXT:    vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr3, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr3, $vr3, 48
 ; CHECK-NEXT:    vsrai.d $vr3, $vr3, 48
-; CHECK-NEXT:    vshuf.h $vr5, $vr0, $vr0
-; CHECK-NEXT:    vori.b $vr4, $vr1, 0
-; CHECK-NEXT:    vshuf.h $vr4, $vr0, $vr5
-; CHECK-NEXT:    vslli.d $vr4, $vr4, 48
-; CHECK-NEXT:    vsrai.d $vr4, $vr4, 48
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vslli.d $vr0, $vr1, 48
+; CHECK-NEXT:    vilvh.w $vr0, $vr0, $vr0
+; CHECK-NEXT:    vslli.d $vr0, $vr0, 48
 ; CHECK-NEXT:    vsrai.d $vr0, $vr0, 48
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr4, $a1, 48
+; CHECK-NEXT:    vst $vr0, $a1, 48
 ; CHECK-NEXT:    vst $vr3, $a1, 32
-; CHECK-NEXT:    vst $vr2, $a1, 16
+; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <8 x i16>, ptr %ptr
@@ -304,15 +262,14 @@ define void @load_sext_4i32_to_4i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_sext_4i32_to_4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    vshuf4i.w $vr1, $vr0, 14
-; CHECK-NEXT:    vshuf4i.w $vr1, $vr1, 16
+; CHECK-NEXT:    vshuf4i.w $vr1, $vr0, 16
 ; CHECK-NEXT:    vslli.d $vr1, $vr1, 32
 ; CHECK-NEXT:    vsrai.d $vr1, $vr1, 32
-; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 16
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 50
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
 ; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vst $vr0, $a1, 16
+; CHECK-NEXT:    vst $vr1, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <4 x i32>, ptr %ptr

diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
index e95a69775e3aa..9b1b584bd9c76 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
@@ -5,10 +5,10 @@ define void @shuffle_any_ext_2i8_to_2i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_any_ext_2i8_to_2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.h $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <2 x i8>, ptr %ptr
@@ -22,10 +22,9 @@ define void @shuffle_any_ext_2i16_to_2i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_any_ext_2i16_to_2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vshuf.h $vr0, $vr0, $vr1
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <2 x i16>, ptr %ptr
@@ -54,10 +53,9 @@ define void @shuffle_any_ext_4i8_to_4i32(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_any_ext_4i8_to_4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <4 x i8>, ptr %ptr
@@ -133,22 +131,16 @@ define void @shuffle_any_ext_8i16_to_8i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_any_ext_8i16_to_8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vshuf.h $vr2, $vr0, $vr0
-; CHECK-NEXT:    vshuf.h $vr3, $vr0, $vr0
-; CHECK-NEXT:    vshuf.h $vr4, $vr0, $vr0
-; CHECK-NEXT:    vst $vr4, $a1, 48
+; CHECK-NEXT:    vilvl.h $vr1, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr2, $vr1, $vr1
+; CHECK-NEXT:    vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT:    vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr3, $vr0, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 48
 ; CHECK-NEXT:    vst $vr3, $a1, 32
-; CHECK-NEXT:    vst $vr2, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <8 x i16>, ptr %ptr
   %y = shufflevector <8 x i16> %x, <8 x i16> poison, <32 x i32> <i32 0, i32 15, i32 15, i32 15, i32 1, i32 14, i32 14, i32 14, i32 2, i32 13, i32 13, i32 13, i32 3, i32 12, i32 12, i32 12, i32 4, i32 11, i32 11, i32 11, i32 5, i32 10, i32 10, i32 10, i32 6, i32 9, i32 9, i32 9, i32 7, i32 8, i32 8, i32 8>
@@ -177,22 +169,16 @@ define void @shuffle_any_ext_16i8_to_16i32(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_any_ext_16i8_to_16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_3)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI10_3)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT:    vshuf.b $vr2, $vr0, $vr0, $vr2
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr4
+; CHECK-NEXT:    vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT:    vilvh.h $vr1, $vr1, $vr1
+; CHECK-NEXT:    vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr3, $vr0, $vr0
+; CHECK-NEXT:    vilvh.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 48
 ; CHECK-NEXT:    vst $vr3, $a1, 32
-; CHECK-NEXT:    vst $vr2, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <16 x i8>, ptr %ptr
   %y = shufflevector <16 x i8> %x, <16 x i8> poison, <64 x i32> <i32 0, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16>
@@ -205,38 +191,28 @@ define void @shuffle_any_ext_16i8_to_16i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_any_ext_16i8_to_16i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI11_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_2)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI11_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_3)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI11_3)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT:    vshuf.b $vr2, $vr0, $vr0, $vr2
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT:    vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_4)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI11_4)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_5)
-; CHECK-NEXT:    vld $vr6, $a0, %pc_lo12(.LCPI11_5)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_6)
-; CHECK-NEXT:    vld $vr7, $a0, %pc_lo12(.LCPI11_6)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_7)
-; CHECK-NEXT:    vld $vr8, $a0, %pc_lo12(.LCPI11_7)
-; CHECK-NEXT:    vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT:    vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT:    vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr8
+; CHECK-NEXT:    vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT:    vilvl.w $vr3, $vr2, $vr2
+; CHECK-NEXT:    vilvh.w $vr2, $vr2, $vr2
+; CHECK-NEXT:    vilvh.h $vr1, $vr1, $vr1
+; CHECK-NEXT:    vilvl.w $vr4, $vr1, $vr1
+; CHECK-NEXT:    vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT:    vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr5, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr6, $vr5, $vr5
+; CHECK-NEXT:    vilvh.w $vr5, $vr5, $vr5
+; CHECK-NEXT:    vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.w $vr7, $vr0, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 112
 ; CHECK-NEXT:    vst $vr7, $a1, 96
-; CHECK-NEXT:    vst $vr6, $a1, 80
-; CHECK-NEXT:    vst $vr5, $a1, 64
-; CHECK-NEXT:    vst $vr4, $a1, 48
-; CHECK-NEXT:    vst $vr3, $a1, 32
+; CHECK-NEXT:    vst $vr5, $a1, 80
+; CHECK-NEXT:    vst $vr6, $a1, 64
+; CHECK-NEXT:    vst $vr1, $a1, 48
+; CHECK-NEXT:    vst $vr4, $a1, 32
 ; CHECK-NEXT:    vst $vr2, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vst $vr3, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <16 x i8>, ptr %ptr
   %y = shufflevector <16 x i8> %x, <16 x i8> poison, <128 x i32> <i32 0, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>

diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
index b1ad16a35251b..7e9f5b653d01a 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
@@ -5,11 +5,11 @@ define void @shuffle_sign_ext_2i8_to_2i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_2i8_to_2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.h $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <2 x i8>, ptr %ptr
@@ -23,11 +23,10 @@ define void @shuffle_sign_ext_2i16_to_2i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_2i16_to_2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <2 x i16>, ptr %ptr
@@ -41,11 +40,9 @@ define void @shuffle_sign_ext_2i32_to_2i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_2i32_to_2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr0, $vr1, $vr2
+; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <2 x i32>, ptr %ptr
@@ -59,11 +56,10 @@ define void @shuffle_sign_ext_4i8_to_4i32(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_4i8_to_4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <4 x i8>, ptr %ptr
@@ -77,11 +73,9 @@ define void @shuffle_sign_ext_4i16_to_4i32(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_4i16_to_4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, ptr %ptr
@@ -95,11 +89,9 @@ define void @shuffle_sign_ext_8i8_to_8i16(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_8i8_to_8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, ptr %ptr
@@ -113,15 +105,11 @@ define void @shuffle_sign_ext_4i32_to_4i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_4i32_to_4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr0, $vr3
-; CHECK-NEXT:    vshuf.w $vr2, $vr0, $vr3
-; CHECK-NEXT:    vst $vr2, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <4 x i32>, ptr %ptr
   %y = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
@@ -134,15 +122,11 @@ define void @shuffle_sign_ext_8i16_to_8i32(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_8i16_to_8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr3
-; CHECK-NEXT:    vshuf.h $vr2, $vr0, $vr3
-; CHECK-NEXT:    vst $vr2, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <8 x i16>, ptr %ptr
   %y = shufflevector <8 x i16> %x, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 15, i32 1, i32 14, i32 2, i32 13, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>
@@ -155,23 +139,17 @@ define void @shuffle_sign_ext_8i16_to_8i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_8i16_to_8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr2
-; CHECK-NEXT:    vshuf.h $vr3, $vr0, $vr2
-; CHECK-NEXT:    vshuf.h $vr4, $vr0, $vr2
-; CHECK-NEXT:    vshuf.h $vr5, $vr0, $vr2
-; CHECK-NEXT:    vst $vr5, $a1, 48
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr3, $vr1, $vr2
+; CHECK-NEXT:    vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr4, $vr1, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 48
 ; CHECK-NEXT:    vst $vr4, $a1, 32
-; CHECK-NEXT:    vst $vr3, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vst $vr2, $a1, 16
+; CHECK-NEXT:    vst $vr3, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <8 x i16>, ptr %ptr
   %y = shufflevector <8 x i16> %x, <8 x i16> zeroinitializer, <32 x i32> <i32 0, i32 15, i32 15, i32 15, i32 1, i32 14, i32 14, i32 14, i32 2, i32 13, i32 13, i32 13, i32 3, i32 12, i32 12, i32 12, i32 4, i32 11, i32 11, i32 11, i32 5, i32 10, i32 10, i32 10, i32 6, i32 9, i32 9, i32 9, i32 7, i32 8, i32 8, i32 8>
@@ -184,15 +162,11 @@ define void @shuffle_sign_ext_16i8_to_16i16(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_16i8_to_16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI9_1)
-; CHECK-NEXT:    vrepli.b $vr3, 0
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr3, $vr1
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr3, $vr2
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <16 x i8>, ptr %ptr
   %y = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 31, i32 1, i32 30, i32 2, i32 29, i32 3, i32 28, i32 4, i32 27, i32 5, i32 26, i32 6, i32 25, i32 7, i32 24, i32 8, i32 23, i32 9, i32 22, i32 10, i32 21, i32 11, i32 20, i32 12, i32 19, i32 13, i32 18, i32 14, i32 17, i32 15, i32 16 >
@@ -205,23 +179,17 @@ define void @shuffle_sign_ext_16i8_to_16i32(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_16i8_to_16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_3)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI10_3)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr2, $vr1
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr2, $vr3
-; CHECK-NEXT:    vshuf.b $vr4, $vr0, $vr2, $vr4
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr2, $vr5
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT:    vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr4, $vr1, $vr0
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 48
 ; CHECK-NEXT:    vst $vr4, $a1, 32
-; CHECK-NEXT:    vst $vr3, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vst $vr2, $a1, 16
+; CHECK-NEXT:    vst $vr3, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <16 x i8>, ptr %ptr
   %y = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <64 x i32> <i32 0, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16>
@@ -234,39 +202,29 @@ define void @shuffle_sign_ext_16i8_to_16i64(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: shuffle_sign_ext_16i8_to_16i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_1)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI11_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_2)
-; CHECK-NEXT:    vld $vr4, $a0, %pc_lo12(.LCPI11_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_3)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI11_3)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr2, $vr1
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr2, $vr3
-; CHECK-NEXT:    vshuf.b $vr4, $vr0, $vr2, $vr4
-; CHECK-NEXT:    vshuf.b $vr5, $vr0, $vr2, $vr5
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_4)
-; CHECK-NEXT:    vld $vr6, $a0, %pc_lo12(.LCPI11_4)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_5)
-; CHECK-NEXT:    vld $vr7, $a0, %pc_lo12(.LCPI11_5)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_6)
-; CHECK-NEXT:    vld $vr8, $a0, %pc_lo12(.LCPI11_6)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_7)
-; CHECK-NEXT:    vld $vr9, $a0, %pc_lo12(.LCPI11_7)
-; CHECK-NEXT:    vshuf.b $vr6, $vr0, $vr2, $vr6
-; CHECK-NEXT:    vshuf.b $vr7, $vr0, $vr2, $vr7
-; CHECK-NEXT:    vshuf.b $vr8, $vr0, $vr2, $vr8
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr2, $vr9
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT:    vilvl.w $vr4, $vr1, $vr3
+; CHECK-NEXT:    vilvh.w $vr3, $vr1, $vr3
+; CHECK-NEXT:    vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvl.w $vr5, $vr1, $vr2
+; CHECK-NEXT:    vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr6, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr7, $vr1, $vr6
+; CHECK-NEXT:    vilvh.w $vr6, $vr1, $vr6
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr8, $vr1, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 112
 ; CHECK-NEXT:    vst $vr8, $a1, 96
-; CHECK-NEXT:    vst $vr7, $a1, 80
-; CHECK-NEXT:    vst $vr6, $a1, 64
-; CHECK-NEXT:    vst $vr5, $a1, 48
-; CHECK-NEXT:    vst $vr4, $a1, 32
+; CHECK-NEXT:    vst $vr6, $a1, 80
+; CHECK-NEXT:    vst $vr7, $a1, 64
+; CHECK-NEXT:    vst $vr2, $a1, 48
+; CHECK-NEXT:    vst $vr5, $a1, 32
 ; CHECK-NEXT:    vst $vr3, $a1, 16
-; CHECK-NEXT:    vst $vr1, $a1, 0
+; CHECK-NEXT:    vst $vr4, $a1, 0
 ; CHECK-NEXT:    ret
   %x = load <16 x i8>, ptr %ptr
   %y = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <128 x i32> <i32 0, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>

diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
index 85e89e84464ef..f5bff59a26698 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
@@ -4,15 +4,14 @@
 define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: load_trunc_2i64_to_2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr0, $a0, 0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
+; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
 ; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT:    st.w $a0, $sp, 4
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vpackev.w $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
-; CHECK-NEXT:    st.w $a0, $sp, 0
-; CHECK-NEXT:    ld.d $a0, $sp, 0
 ; CHECK-NEXT:    st.d $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ptr
   %trunc = trunc <2 x i64> %a to <2 x i32>
@@ -23,16 +22,14 @@ define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind {
 define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: load_trunc_2i64_to_2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT:    st.h $a0, $sp, 2
 ; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
-; CHECK-NEXT:    st.h $a0, $sp, 0
-; CHECK-NEXT:    vld $vr0, $sp, 0
+; CHECK-NEXT:    vinsgr2vr.h $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT:    vpackev.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 0
 ; CHECK-NEXT:    st.w $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ptr
   %trunc = trunc <2 x i64> %a to <2 x i16>
@@ -43,16 +40,14 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind {
 define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: load_trunc_2i64_to_2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT:    st.b $a0, $sp, 1
 ; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 0
-; CHECK-NEXT:    st.b $a0, $sp, 0
-; CHECK-NEXT:    vld $vr0, $sp, 0
+; CHECK-NEXT:    vinsgr2vr.b $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT:    vinsgr2vr.b $vr0, $a0, 0
+; CHECK-NEXT:    vpackev.b $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 0
 ; CHECK-NEXT:    st.h $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ptr
   %trunc = trunc <2 x i64> %a to <2 x i8>
@@ -141,17 +136,15 @@ define void @load_trunc_8i16_to_8i8(ptr %ptr, ptr %dst) nounwind {
 define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: load_trunc_2i32_to_2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
 ; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
-; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT:    st.h $a0, $sp, 2
 ; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 0
-; CHECK-NEXT:    st.h $a0, $sp, 0
-; CHECK-NEXT:    vld $vr0, $sp, 0
+; CHECK-NEXT:    vinsgr2vr.h $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 1
+; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT:    vpackev.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 0
 ; CHECK-NEXT:    st.w $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
   %a = load <2 x i32>, ptr %ptr
   %trunc = trunc <2 x i32> %a to <2 x i16>
@@ -162,17 +155,15 @@ define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind {
 define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: load_trunc_2i32_to_2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
 ; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
-; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT:    st.b $a0, $sp, 1
 ; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 0
-; CHECK-NEXT:    st.b $a0, $sp, 0
-; CHECK-NEXT:    vld $vr0, $sp, 0
+; CHECK-NEXT:    vinsgr2vr.b $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 1
+; CHECK-NEXT:    vinsgr2vr.b $vr0, $a0, 0
+; CHECK-NEXT:    vpackev.b $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 0
 ; CHECK-NEXT:    st.h $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
   %a = load <2 x i32>, ptr %ptr
   %trunc = trunc <2 x i32> %a to <2 x i8>
@@ -208,17 +199,15 @@ define void @load_trunc_4i16_to_4i8(ptr %ptr, ptr %dst) nounwind {
 define void @load_trunc_2i16_to_2i8(ptr %ptr, ptr %dst) nounwind {
 ; CHECK-LABEL: load_trunc_2i16_to_2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
 ; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
-; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 1
-; CHECK-NEXT:    st.b $a0, $sp, 1
 ; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 0
-; CHECK-NEXT:    st.b $a0, $sp, 0
-; CHECK-NEXT:    vld $vr0, $sp, 0
+; CHECK-NEXT:    vinsgr2vr.b $vr1, $a0, 0
+; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 1
+; CHECK-NEXT:    vinsgr2vr.b $vr0, $a0, 0
+; CHECK-NEXT:    vpackev.b $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 0
 ; CHECK-NEXT:    st.h $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
   %a = load <2 x i16>, ptr %ptr
   %trunc = trunc <2 x i16> %a to <2 x i8>

diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
index 3cc9b62d76736..bbcfbe1b07260 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
@@ -6,11 +6,11 @@ define void @load_zext_2i8_to_2i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_2i8_to_2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.h $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -24,11 +24,10 @@ define void @load_zext_4i8_to_4i32(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_4i8_to_4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -42,11 +41,9 @@ define void @load_zext_8i8_to_8i16(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_8i8_to_8i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -60,11 +57,10 @@ define void @load_zext_2i16_to_2i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_2i16_to_2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -78,11 +74,9 @@ define void @load_zext_4i16_to_4i32(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_4i16_to_4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -96,11 +90,9 @@ define void @load_zext_2i32_to_2i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_2i32_to_2i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT:    vrepli.b $vr2, 0
-; CHECK-NEXT:    vshuf.w $vr0, $vr1, $vr2
+; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -114,18 +106,11 @@ define void @load_zext_16i8_to_16i16(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_16i8_to_16i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI6_2)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI6_2)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT:    vrepli.b $vr4, 0
-; CHECK-NEXT:    vshuf.b $vr1, $vr4, $vr1, $vr2
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr4, $vr3
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <16 x i8>, ptr %ptr
@@ -138,28 +123,17 @@ define void @load_zext_16i8_to_16i32(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_16i8_to_16i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_2)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI7_2)
-; CHECK-NEXT:    vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT:    vrepli.b $vr4, 0
-; CHECK-NEXT:    vshuf.b $vr1, $vr4, $vr1, $vr2
-; CHECK-NEXT:    vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_3)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI7_3)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI7_4)
-; CHECK-NEXT:    vld $vr6, $a0, %pc_lo12(.LCPI7_4)
-; CHECK-NEXT:    vshuf.b $vr3, $vr4, $vr3, $vr2
-; CHECK-NEXT:    vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT:    vshuf.b $vr2, $vr4, $vr5, $vr2
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr4, $vr6
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr2, $a1, 48
-; CHECK-NEXT:    vst $vr3, $a1, 32
-; CHECK-NEXT:    vst $vr1, $a1, 16
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT:    vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr4, $vr1, $vr0
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 48
+; CHECK-NEXT:    vst $vr4, $a1, 32
+; CHECK-NEXT:    vst $vr2, $a1, 16
+; CHECK-NEXT:    vst $vr3, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <16 x i8>, ptr %ptr
@@ -172,46 +146,29 @@ define void @load_zext_16i8_to_16i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_16i8_to_16i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT:    vshuf4i.b $vr3, $vr0, 14
-; CHECK-NEXT:    vrepli.b $vr4, 0
-; CHECK-NEXT:    vshuf.b $vr3, $vr4, $vr3, $vr1
-; CHECK-NEXT:    vshuf.b $vr2, $vr0, $vr0, $vr2
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT:    vld $vr5, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT:    vld $vr6, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT:    vshuf.b $vr2, $vr4, $vr2, $vr1
-; CHECK-NEXT:    vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT:    vshuf.b $vr5, $vr4, $vr5, $vr1
-; CHECK-NEXT:    vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_4)
-; CHECK-NEXT:    vld $vr7, $a0, %pc_lo12(.LCPI8_4)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_5)
-; CHECK-NEXT:    vld $vr8, $a0, %pc_lo12(.LCPI8_5)
-; CHECK-NEXT:    vshuf.b $vr6, $vr4, $vr6, $vr1
-; CHECK-NEXT:    vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT:    vshuf.b $vr7, $vr4, $vr7, $vr1
-; CHECK-NEXT:    vshuf.b $vr8, $vr0, $vr0, $vr8
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_6)
-; CHECK-NEXT:    vld $vr9, $a0, %pc_lo12(.LCPI8_6)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI8_7)
-; CHECK-NEXT:    vld $vr10, $a0, %pc_lo12(.LCPI8_7)
-; CHECK-NEXT:    vshuf.b $vr8, $vr4, $vr8, $vr1
-; CHECK-NEXT:    vshuf.b $vr9, $vr0, $vr0, $vr9
-; CHECK-NEXT:    vshuf.b $vr1, $vr4, $vr9, $vr1
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr4, $vr10
-; CHECK-NEXT:    vst $vr0, $a1, 0
-; CHECK-NEXT:    vst $vr1, $a1, 112
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT:    vilvl.w $vr4, $vr1, $vr3
+; CHECK-NEXT:    vilvh.w $vr3, $vr1, $vr3
+; CHECK-NEXT:    vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvl.w $vr5, $vr1, $vr2
+; CHECK-NEXT:    vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.h $vr6, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr7, $vr1, $vr6
+; CHECK-NEXT:    vilvh.w $vr6, $vr1, $vr6
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr8, $vr1, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 112
 ; CHECK-NEXT:    vst $vr8, $a1, 96
-; CHECK-NEXT:    vst $vr7, $a1, 80
-; CHECK-NEXT:    vst $vr6, $a1, 64
-; CHECK-NEXT:    vst $vr5, $a1, 48
-; CHECK-NEXT:    vst $vr2, $a1, 32
+; CHECK-NEXT:    vst $vr6, $a1, 80
+; CHECK-NEXT:    vst $vr7, $a1, 64
+; CHECK-NEXT:    vst $vr2, $a1, 48
+; CHECK-NEXT:    vst $vr5, $a1, 32
 ; CHECK-NEXT:    vst $vr3, $a1, 16
+; CHECK-NEXT:    vst $vr4, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <16 x i8>, ptr %ptr
@@ -224,18 +181,11 @@ define void @load_zext_8i16_to_8i32(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_8i16_to_8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI9_1)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI9_2)
-; CHECK-NEXT:    vld $vr3, $a0, %pc_lo12(.LCPI9_2)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vrepli.b $vr4, 0
-; CHECK-NEXT:    vshuf.h $vr2, $vr4, $vr1
-; CHECK-NEXT:    vshuf.h $vr3, $vr0, $vr4
-; CHECK-NEXT:    vst $vr3, $a1, 0
-; CHECK-NEXT:    vst $vr2, $a1, 16
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 16
+; CHECK-NEXT:    vst $vr2, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <8 x i16>, ptr %ptr
@@ -248,28 +198,17 @@ define void @load_zext_8i16_to_8i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_8i16_to_8i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT:    vshuf4i.h $vr3, $vr0, 14
-; CHECK-NEXT:    vrepli.b $vr4, 0
-; CHECK-NEXT:    vori.b $vr5, $vr1, 0
-; CHECK-NEXT:    vshuf.h $vr5, $vr4, $vr3
-; CHECK-NEXT:    vshuf.h $vr2, $vr0, $vr0
-; CHECK-NEXT:    vori.b $vr3, $vr1, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT:    vld $vr6, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI10_3)
-; CHECK-NEXT:    vld $vr7, $a0, %pc_lo12(.LCPI10_3)
-; CHECK-NEXT:    vshuf.h $vr3, $vr4, $vr2
-; CHECK-NEXT:    vshuf.h $vr6, $vr0, $vr0
-; CHECK-NEXT:    vshuf.h $vr1, $vr4, $vr6
-; CHECK-NEXT:    vshuf.h $vr7, $vr0, $vr4
-; CHECK-NEXT:    vst $vr7, $a1, 0
-; CHECK-NEXT:    vst $vr1, $a1, 48
-; CHECK-NEXT:    vst $vr3, $a1, 32
-; CHECK-NEXT:    vst $vr5, $a1, 16
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr3, $vr1, $vr2
+; CHECK-NEXT:    vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vilvl.w $vr4, $vr1, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 48
+; CHECK-NEXT:    vst $vr4, $a1, 32
+; CHECK-NEXT:    vst $vr2, $a1, 16
+; CHECK-NEXT:    vst $vr3, $a1, 0
 ; CHECK-NEXT:    ret
 entry:
   %A = load <8 x i16>, ptr %ptr
@@ -282,16 +221,11 @@ define void @load_zext_4i32_to_4i64(ptr %ptr, ptr %dst) {
 ; CHECK-LABEL: load_zext_4i32_to_4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a0, 0
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI11_1)
-; CHECK-NEXT:    vld $vr2, $a0, %pc_lo12(.LCPI11_1)
-; CHECK-NEXT:    vshuf4i.w $vr3, $vr0, 14
-; CHECK-NEXT:    vrepli.b $vr4, 0
-; CHECK-NEXT:    vshuf.w $vr1, $vr4, $vr3
-; CHECK-NEXT:    vshuf.w $vr2, $vr0, $vr4
+; CHECK-NEXT:    vrepli.b $vr1, 0
+; CHECK-NEXT:    vilvl.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a1, 16
 ; CHECK-NEXT:    vst $vr2, $a1, 0
-; CHECK-NEXT:    vst $vr1, $a1, 16
 ; CHECK-NEXT:    ret
 entry:
   %A = load <4 x i32>, ptr %ptr


        


More information about the llvm-commits mailing list