[llvm] e3bd1f2 - [LoongArch] lower vector shuffle to zero or any extend (#129485)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 23 23:29:14 PDT 2025
Author: tangaac
Date: 2025-03-24T14:29:11+08:00
New Revision: e3bd1f2b3f2ff8317e72fc828db8bcdbeb50b1aa
URL: https://github.com/llvm/llvm-project/commit/e3bd1f2b3f2ff8317e72fc828db8bcdbeb50b1aa
DIFF: https://github.com/llvm/llvm-project/commit/e3bd1f2b3f2ff8317e72fc828db8bcdbeb50b1aa.diff
LOG: [LoongArch] lower vector shuffle to zero or any extend (#129485)
Added:
Modified:
llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
llvm/lib/Target/LoongArch/LoongArchISelLowering.h
llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index dceb3c682d2df..f69019bf34f37 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -542,6 +542,132 @@ fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
return true;
}
+/// Compute whether each element of a shuffle is zeroable: KnownUndef gets a bit
+/// for every negative mask index, KnownZero for every index that selects from
+/// an all-zeros build vector. A "zeroable" element can be lowered to zero.
+static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, APInt &KnownUndef,
+ APInt &KnownZero) {
+ int Size = Mask.size();
+ KnownUndef = KnownZero = APInt::getZero(Size); // Start with no lanes marked.
+
+ V1 = peekThroughBitcasts(V1); // Inspect the operands beneath any bitcasts.
+ V2 = peekThroughBitcasts(V2);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ int VectorSizeInBits = V1.getValueSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Size;
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+ for (int i = 0; i < Size; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ KnownUndef.setBit(i); // Negative mask index: the lane is undef.
+ continue;
+ }
+ if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ KnownZero.setBit(i); // The lane reads from an all-zeros operand.
+ continue;
+ }
+ }
+}
+
+/// Lower VECTOR_SHUFFLE as ZERO_EXTEND or ANY_EXTEND (if possible).
+///
+/// For example:
+/// %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+/// <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+/// %3 = bitcast <4 x i32> %2 to <2 x i64>
+/// is lowered to:
+/// (VREPLI $v1, 0)
+/// (VILVL $v0, $v1, $v0)
+static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
+ ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ int Bits = VT.getSizeInBits();
+ int EltBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+
+ APInt KnownUndef, KnownZero;
+ computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
+ APInt Zeroable = KnownUndef | KnownZero;
+ if (Zeroable.isAllOnes())
+ return DAG.getConstant(0, DL, VT); // Every lane is undef or zero.
+
+ // Define a helper function to check a particular ext-scale and lower to it if
+ // valid.
+ auto Lower = [&](int Scale) -> SDValue {
+ SDValue InputV;
+ bool AnyExt = true; // Stays true only if every extension lane is undef.
+ int Offset = 0;
+ for (int i = 0; i < NumElements; i++) {
+ int M = Mask[i];
+ if (M < 0)
+ continue; // Undef lanes place no constraint on the pattern.
+ if (i % Scale != 0) {
+ // Each of the extended elements needs to be zeroable.
+ if (!Zeroable[i])
+ return SDValue();
+
+ AnyExt = false;
+ continue;
+ }
+
+ // Each of the base elements needs to be consecutive indices into the
+ // same input vector.
+ SDValue V = M < NumElements ? V1 : V2;
+ M = M % NumElements;
+ if (!InputV) {
+ InputV = V;
+ Offset = M - (i / Scale);
+
+ // Only offsets that are multiples of NumElements / Scale are handled.
+ if (Offset % (NumElements / Scale))
+ return SDValue();
+ } else if (InputV != V)
+ return SDValue();
+
+ if (M != (Offset + (i / Scale)))
+ return SDValue(); // Non-consecutive strided elements.
+ }
+
+ // If we fail to find an input, we have a zero-shuffle which should always
+ // have already been handled.
+ if (!InputV)
+ return SDValue();
+
+ do { // One interleave per step; each step doubles the element width.
+ unsigned VilVLoHi = LoongArchISD::VILVL;
+ if (Offset >= (NumElements / 2)) {
+ VilVLoHi = LoongArchISD::VILVH;
+ Offset -= (NumElements / 2);
+ }
+
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ SDValue Ext =
+ AnyExt ? DAG.getFreeze(InputV) : DAG.getConstant(0, DL, InputVT);
+ InputV = DAG.getBitcast(InputVT, InputV);
+ InputV = DAG.getNode(VilVLoHi, DL, InputVT, Ext, InputV);
+ Scale /= 2;
+ EltBits *= 2; // NOTE(review): modifies the outer variable captured by [&].
+ NumElements /= 2; // NOTE(review): likewise; the caller returns on success.
+ } while (Scale > 1);
+ return DAG.getBitcast(VT, InputV);
+ };
+
+ // Each iteration, try extending the elements half as much, but into twice as
+ // many elements.
+ for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
+ NumExtElements *= 2) {
+ if (SDValue V = Lower(NumElements / NumExtElements))
+ return V;
+ }
+ return SDValue(); // No extension pattern matched.
+}
+
/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
///
/// VREPLVEI performs vector broadcast based on an element specified by an
@@ -956,6 +1082,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
return Result;
+ if ((Result =
+ lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
+ return Result;
if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
return Result;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f8d4cef76b955..002fad0e20759 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -271,7 +271,14 @@ class LoongArchTargetLowering : public TargetLowering {
unsigned *Fast = nullptr) const override;
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override {
- return false;
+ if (!VT.isSimple())
+ return false; // Only simple value types are accepted.
+
+ // Not for i1 vectors.
+ if (VT.getSimpleVT().getScalarType() == MVT::i1)
+ return false;
+
+ return isTypeLegal(VT.getSimpleVT()); // Mask is not inspected; only VT matters.
}
bool shouldConsiderGEPOffsetSplit() const override { return true; }
bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override;
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index eaab6524c5317..984b6f3d74866 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -374,10 +374,11 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0)
+; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI24_0)
+; CHECK-NEXT: vrepli.b $vr2, 0
+; CHECK-NEXT: vshuf.w $vr1, $vr2, $vr0
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x i32>, ptr %src
%e = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
index 75639ae090661..9485df746ff1c 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
@@ -6,10 +6,10 @@ define void @load_sext_2i8_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_2i8_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.h $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
; CHECK-NEXT: vst $vr0, $a1, 0
@@ -25,10 +25,9 @@ define void @load_sext_4i8_to_4i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_4i8_to_4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
; CHECK-NEXT: vst $vr0, $a1, 0
@@ -61,10 +60,9 @@ define void @load_sext_2i16_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_2i16_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr0, $vr1
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 48
; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
; CHECK-NEXT: vst $vr0, $a1, 0
@@ -114,17 +112,14 @@ define void @load_sext_16i8_to_16i16(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_16i8_to_16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vilvl.b $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
; CHECK-NEXT: vslli.h $vr1, $vr1, 8
; CHECK-NEXT: vsrai.h $vr1, $vr1, 8
-; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.h $vr0, $vr0, 8
; CHECK-NEXT: vsrai.h $vr0, $vr0, 8
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -137,33 +132,24 @@ define void @load_sext_16i8_to_16i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_16i8_to_16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI7_2)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr1, $vr2
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT: vslli.w $vr2, $vr2, 24
+; CHECK-NEXT: vsrai.w $vr2, $vr2, 24
+; CHECK-NEXT: vilvh.h $vr1, $vr1, $vr1
; CHECK-NEXT: vslli.w $vr1, $vr1, 24
; CHECK-NEXT: vsrai.w $vr1, $vr1, 24
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_3)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI7_3)
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr3, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr3, $vr3, 24
; CHECK-NEXT: vsrai.w $vr3, $vr3, 24
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr4, $vr2
-; CHECK-NEXT: vslli.w $vr4, $vr4, 24
-; CHECK-NEXT: vsrai.w $vr4, $vr4, 24
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr2
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr4, $a1, 48
+; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr3, $a1, 32
; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -176,59 +162,44 @@ define void @load_sext_16i8_to_16i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_16i8_to_16i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: vshuf4i.b $vr2, $vr0, 14
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT: vshuf.b $vr2, $vr0, $vr2, $vr1
-; CHECK-NEXT: vslli.d $vr2, $vr2, 56
-; CHECK-NEXT: vsrai.d $vr2, $vr2, 56
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr3, $vr1
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT: vilvl.w $vr3, $vr2, $vr2
; CHECK-NEXT: vslli.d $vr3, $vr3, 56
; CHECK-NEXT: vsrai.d $vr3, $vr3, 56
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr4, $vr1
+; CHECK-NEXT: vilvh.w $vr2, $vr2, $vr2
+; CHECK-NEXT: vslli.d $vr2, $vr2, 56
+; CHECK-NEXT: vsrai.d $vr2, $vr2, 56
+; CHECK-NEXT: vilvh.h $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr1
; CHECK-NEXT: vslli.d $vr4, $vr4, 56
; CHECK-NEXT: vsrai.d $vr4, $vr4, 56
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_4)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI8_4)
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr5, $vr1
-; CHECK-NEXT: vslli.d $vr5, $vr5, 56
-; CHECK-NEXT: vsrai.d $vr5, $vr5, 56
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_5)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI8_5)
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr6, $vr1
+; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT: vslli.d $vr1, $vr1, 56
+; CHECK-NEXT: vsrai.d $vr1, $vr1, 56
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr5, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr6, $vr5, $vr5
; CHECK-NEXT: vslli.d $vr6, $vr6, 56
; CHECK-NEXT: vsrai.d $vr6, $vr6, 56
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_6)
-; CHECK-NEXT: vld $vr8, $a0, %pc_lo12(.LCPI8_6)
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr7, $vr1
+; CHECK-NEXT: vilvh.w $vr5, $vr5, $vr5
+; CHECK-NEXT: vslli.d $vr5, $vr5, 56
+; CHECK-NEXT: vsrai.d $vr5, $vr5, 56
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr7, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr7, $vr7, 56
; CHECK-NEXT: vsrai.d $vr7, $vr7, 56
-; CHECK-NEXT: vshuf.b $vr8, $vr0, $vr0, $vr8
-; CHECK-NEXT: vshuf.b $vr8, $vr0, $vr8, $vr1
-; CHECK-NEXT: vslli.d $vr8, $vr8, 56
-; CHECK-NEXT: vsrai.d $vr8, $vr8, 56
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr8, $a1, 112
+; CHECK-NEXT: vst $vr0, $a1, 112
; CHECK-NEXT: vst $vr7, $a1, 96
-; CHECK-NEXT: vst $vr6, $a1, 80
-; CHECK-NEXT: vst $vr5, $a1, 64
-; CHECK-NEXT: vst $vr4, $a1, 48
-; CHECK-NEXT: vst $vr3, $a1, 32
+; CHECK-NEXT: vst $vr5, $a1, 80
+; CHECK-NEXT: vst $vr6, $a1, 64
+; CHECK-NEXT: vst $vr1, $a1, 48
+; CHECK-NEXT: vst $vr4, $a1, 32
; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -241,17 +212,14 @@ define void @load_sext_8i16_to_8i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_8i16_to_8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vilvl.h $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr1, $vr1, 16
; CHECK-NEXT: vsrai.w $vr1, $vr1, 16
-; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 16
; CHECK-NEXT: vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -264,34 +232,24 @@ define void @load_sext_8i16_to_8i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_8i16_to_8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: vshuf4i.h $vr2, $vr0, 14
-; CHECK-NEXT: vori.b $vr3, $vr1, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr2
-; CHECK-NEXT: vslli.d $vr2, $vr3, 48
+; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr1
+; CHECK-NEXT: vslli.d $vr2, $vr2, 48
; CHECK-NEXT: vsrai.d $vr2, $vr2, 48
-; CHECK-NEXT: vshuf.h $vr4, $vr0, $vr0
-; CHECK-NEXT: vori.b $vr3, $vr1, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr4
+; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT: vslli.d $vr1, $vr1, 48
+; CHECK-NEXT: vsrai.d $vr1, $vr1, 48
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr3, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr3, $vr3, 48
; CHECK-NEXT: vsrai.d $vr3, $vr3, 48
-; CHECK-NEXT: vshuf.h $vr5, $vr0, $vr0
-; CHECK-NEXT: vori.b $vr4, $vr1, 0
-; CHECK-NEXT: vshuf.h $vr4, $vr0, $vr5
-; CHECK-NEXT: vslli.d $vr4, $vr4, 48
-; CHECK-NEXT: vsrai.d $vr4, $vr4, 48
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vslli.d $vr0, $vr1, 48
+; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0
+; CHECK-NEXT: vslli.d $vr0, $vr0, 48
; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr4, $a1, 48
+; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -304,15 +262,14 @@ define void @load_sext_4i32_to_4i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_4i32_to_4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
-; CHECK-NEXT: vshuf4i.w $vr1, $vr1, 16
+; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 16
; CHECK-NEXT: vslli.d $vr1, $vr1, 32
; CHECK-NEXT: vsrai.d $vr1, $vr1, 32
-; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16
+; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 50
; CHECK-NEXT: vslli.d $vr0, $vr0, 32
; CHECK-NEXT: vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <4 x i32>, ptr %ptr
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
index e95a69775e3aa..9b1b584bd9c76 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll
@@ -5,10 +5,10 @@ define void @shuffle_any_ext_2i8_to_2i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_any_ext_2i8_to_2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <2 x i8>, ptr %ptr
@@ -22,10 +22,9 @@ define void @shuffle_any_ext_2i16_to_2i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_any_ext_2i16_to_2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr0, $vr1
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <2 x i16>, ptr %ptr
@@ -54,10 +53,9 @@ define void @shuffle_any_ext_4i8_to_4i32(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_any_ext_4i8_to_4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %ptr
@@ -133,22 +131,16 @@ define void @shuffle_any_ext_8i16_to_8i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_any_ext_8i16_to_8i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vshuf.h $vr2, $vr0, $vr0
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr0
-; CHECK-NEXT: vshuf.h $vr4, $vr0, $vr0
-; CHECK-NEXT: vst $vr4, $a1, 48
+; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr1
+; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr3, $vr0, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr2, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
%x = load <8 x i16>, ptr %ptr
%y = shufflevector <8 x i16> %x, <8 x i16> poison, <32 x i32> <i32 0, i32 15, i32 15, i32 15, i32 1, i32 14, i32 14, i32 14, i32 2, i32 13, i32 13, i32 13, i32 3, i32 12, i32 12, i32 12, i32 4, i32 11, i32 11, i32 11, i32 5, i32 10, i32 10, i32 10, i32 6, i32 9, i32 9, i32 9, i32 7, i32 8, i32 8, i32 8>
@@ -177,22 +169,16 @@ define void @shuffle_any_ext_16i8_to_16i32(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_any_ext_16i8_to_16i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_3)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI10_3)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vshuf.b $vr2, $vr0, $vr0, $vr2
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr4
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT: vilvh.h $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr0, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr2, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
%x = load <16 x i8>, ptr %ptr
%y = shufflevector <16 x i8> %x, <16 x i8> poison, <64 x i32> <i32 0, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16>
@@ -205,38 +191,28 @@ define void @shuffle_any_ext_16i8_to_16i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_any_ext_16i8_to_16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI11_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI11_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_3)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI11_3)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vshuf.b $vr2, $vr0, $vr0, $vr2
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_4)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI11_4)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_5)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI11_5)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_6)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI11_6)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_7)
-; CHECK-NEXT: vld $vr8, $a0, %pc_lo12(.LCPI11_7)
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr8
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT: vilvl.w $vr3, $vr2, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr2, $vr2
+; CHECK-NEXT: vilvh.h $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr1
+; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr5, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr6, $vr5, $vr5
+; CHECK-NEXT: vilvh.w $vr5, $vr5, $vr5
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr7, $vr0, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0
; CHECK-NEXT: vst $vr0, $a1, 112
; CHECK-NEXT: vst $vr7, $a1, 96
-; CHECK-NEXT: vst $vr6, $a1, 80
-; CHECK-NEXT: vst $vr5, $a1, 64
-; CHECK-NEXT: vst $vr4, $a1, 48
-; CHECK-NEXT: vst $vr3, $a1, 32
+; CHECK-NEXT: vst $vr5, $a1, 80
+; CHECK-NEXT: vst $vr6, $a1, 64
+; CHECK-NEXT: vst $vr1, $a1, 48
+; CHECK-NEXT: vst $vr4, $a1, 32
; CHECK-NEXT: vst $vr2, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
%x = load <16 x i8>, ptr %ptr
%y = shufflevector <16 x i8> %x, <16 x i8> poison, <128 x i32> <i32 0, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
index b1ad16a35251b..7e9f5b653d01a 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-sign-ext.ll
@@ -5,11 +5,11 @@ define void @shuffle_sign_ext_2i8_to_2i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_2i8_to_2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <2 x i8>, ptr %ptr
@@ -23,11 +23,10 @@ define void @shuffle_sign_ext_2i16_to_2i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_2i16_to_2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <2 x i16>, ptr %ptr
@@ -41,11 +40,9 @@ define void @shuffle_sign_ext_2i32_to_2i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_2i32_to_2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.w $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <2 x i32>, ptr %ptr
@@ -59,11 +56,10 @@ define void @shuffle_sign_ext_4i8_to_4i32(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_4i8_to_4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <4 x i8>, ptr %ptr
@@ -77,11 +73,9 @@ define void @shuffle_sign_ext_4i16_to_4i32(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_4i16_to_4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <4 x i16>, ptr %ptr
@@ -95,11 +89,9 @@ define void @shuffle_sign_ext_8i8_to_8i16(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_8i8_to_8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
%x = load <8 x i8>, ptr %ptr
@@ -113,15 +105,11 @@ define void @shuffle_sign_ext_4i32_to_4i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_4i32_to_4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT: vrepli.b $vr3, 0
-; CHECK-NEXT: vshuf.w $vr1, $vr0, $vr3
-; CHECK-NEXT: vshuf.w $vr2, $vr0, $vr3
-; CHECK-NEXT: vst $vr2, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
%x = load <4 x i32>, ptr %ptr
%y = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
@@ -134,15 +122,11 @@ define void @shuffle_sign_ext_8i16_to_8i32(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_8i16_to_8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT: vrepli.b $vr3, 0
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr3
-; CHECK-NEXT: vshuf.h $vr2, $vr0, $vr3
-; CHECK-NEXT: vst $vr2, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
%x = load <8 x i16>, ptr %ptr
%y = shufflevector <8 x i16> %x, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 15, i32 1, i32 14, i32 2, i32 13, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>
@@ -155,23 +139,17 @@ define void @shuffle_sign_ext_8i16_to_8i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_8i16_to_8i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr2
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr2
-; CHECK-NEXT: vshuf.h $vr4, $vr0, $vr2
-; CHECK-NEXT: vshuf.h $vr5, $vr0, $vr2
-; CHECK-NEXT: vst $vr5, $a1, 48
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr4, $a1, 32
-; CHECK-NEXT: vst $vr3, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
%x = load <8 x i16>, ptr %ptr
%y = shufflevector <8 x i16> %x, <8 x i16> zeroinitializer, <32 x i32> <i32 0, i32 15, i32 15, i32 15, i32 1, i32 14, i32 14, i32 14, i32 2, i32 13, i32 13, i32 13, i32 3, i32 12, i32 12, i32 12, i32 4, i32 11, i32 11, i32 11, i32 5, i32 10, i32 10, i32 10, i32 6, i32 9, i32 9, i32 9, i32 7, i32 8, i32 8, i32 8>
@@ -184,15 +162,11 @@ define void @shuffle_sign_ext_16i8_to_16i16(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_16i8_to_16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI9_1)
-; CHECK-NEXT: vrepli.b $vr3, 0
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr3, $vr1
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr3, $vr2
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
%x = load <16 x i8>, ptr %ptr
%y = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 31, i32 1, i32 30, i32 2, i32 29, i32 3, i32 28, i32 4, i32 27, i32 5, i32 26, i32 6, i32 25, i32 7, i32 24, i32 8, i32 23, i32 9, i32 22, i32 10, i32 21, i32 11, i32 20, i32 12, i32 19, i32 13, i32 18, i32 14, i32 17, i32 15, i32 16 >
@@ -205,23 +179,17 @@ define void @shuffle_sign_ext_16i8_to_16i32(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_16i8_to_16i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_3)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI10_3)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr2, $vr1
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr2, $vr3
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr2, $vr4
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr2, $vr5
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr4, $vr1, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr4, $a1, 32
-; CHECK-NEXT: vst $vr3, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
%x = load <16 x i8>, ptr %ptr
%y = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <64 x i32> <i32 0, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16>
@@ -234,39 +202,29 @@ define void @shuffle_sign_ext_16i8_to_16i64(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: shuffle_sign_ext_16i8_to_16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI11_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_2)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI11_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_3)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI11_3)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr2, $vr1
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr2, $vr3
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr2, $vr4
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr2, $vr5
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_4)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI11_4)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_5)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI11_5)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_6)
-; CHECK-NEXT: vld $vr8, $a0, %pc_lo12(.LCPI11_6)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_7)
-; CHECK-NEXT: vld $vr9, $a0, %pc_lo12(.LCPI11_7)
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr2, $vr6
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr2, $vr7
-; CHECK-NEXT: vshuf.b $vr8, $vr0, $vr2, $vr8
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr2, $vr9
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr3
+; CHECK-NEXT: vilvh.w $vr3, $vr1, $vr3
+; CHECK-NEXT: vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvl.w $vr5, $vr1, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr6, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr7, $vr1, $vr6
+; CHECK-NEXT: vilvh.w $vr6, $vr1, $vr6
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr8, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 112
; CHECK-NEXT: vst $vr8, $a1, 96
-; CHECK-NEXT: vst $vr7, $a1, 80
-; CHECK-NEXT: vst $vr6, $a1, 64
-; CHECK-NEXT: vst $vr5, $a1, 48
-; CHECK-NEXT: vst $vr4, $a1, 32
+; CHECK-NEXT: vst $vr6, $a1, 80
+; CHECK-NEXT: vst $vr7, $a1, 64
+; CHECK-NEXT: vst $vr2, $a1, 48
+; CHECK-NEXT: vst $vr5, $a1, 32
; CHECK-NEXT: vst $vr3, $a1, 16
-; CHECK-NEXT: vst $vr1, $a1, 0
+; CHECK-NEXT: vst $vr4, $a1, 0
; CHECK-NEXT: ret
%x = load <16 x i8>, ptr %ptr
%y = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <128 x i32> <i32 0, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 1, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 2, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 3, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 4, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 5, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 6, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 7, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 8, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 9, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 10, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 11, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 12, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 13, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 14, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
index 85e89e84464ef..f5bff59a26698 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
@@ -4,15 +4,14 @@
define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: load_trunc_2i64_to_2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
+; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT: st.w $a0, $sp, 4
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vpackev.w $vr0, $vr0, $vr1
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
-; CHECK-NEXT: st.w $a0, $sp, 0
-; CHECK-NEXT: ld.d $a0, $sp, 0
; CHECK-NEXT: st.d $a0, $a1, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%a = load <2 x i64>, ptr %ptr
%trunc = trunc <2 x i64> %a to <2 x i32>
@@ -23,16 +22,14 @@ define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind {
define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: load_trunc_2i64_to_2i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT: st.h $a0, $sp, 2
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
-; CHECK-NEXT: st.h $a0, $sp, 0
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
+; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vpackev.h $vr0, $vr0, $vr1
; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0
; CHECK-NEXT: st.w $a0, $a1, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%a = load <2 x i64>, ptr %ptr
%trunc = trunc <2 x i64> %a to <2 x i16>
@@ -43,16 +40,14 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind {
define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: load_trunc_2i64_to_2i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT: st.b $a0, $sp, 1
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
-; CHECK-NEXT: st.b $a0, $sp, 0
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a0, 0
+; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 0
+; CHECK-NEXT: vpackev.b $vr0, $vr0, $vr1
; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 0
; CHECK-NEXT: st.h $a0, $a1, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%a = load <2 x i64>, ptr %ptr
%trunc = trunc <2 x i64> %a to <2 x i8>
@@ -141,17 +136,15 @@ define void @load_trunc_8i16_to_8i8(ptr %ptr, ptr %dst) nounwind {
define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: load_trunc_2i32_to_2i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT: st.h $a0, $sp, 2
; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0
-; CHECK-NEXT: st.h $a0, $sp, 0
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
+; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vpackev.h $vr0, $vr0, $vr1
; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0
; CHECK-NEXT: st.w $a0, $a1, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%a = load <2 x i32>, ptr %ptr
%trunc = trunc <2 x i32> %a to <2 x i16>
@@ -162,17 +155,15 @@ define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind {
define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: load_trunc_2i32_to_2i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT: st.b $a0, $sp, 1
; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0
-; CHECK-NEXT: st.b $a0, $sp, 0
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a0, 0
+; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 0
+; CHECK-NEXT: vpackev.b $vr0, $vr0, $vr1
; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 0
; CHECK-NEXT: st.h $a0, $a1, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%a = load <2 x i32>, ptr %ptr
%trunc = trunc <2 x i32> %a to <2 x i8>
@@ -208,17 +199,15 @@ define void @load_trunc_4i16_to_4i8(ptr %ptr, ptr %dst) nounwind {
define void @load_trunc_2i16_to_2i8(ptr %ptr, ptr %dst) nounwind {
; CHECK-LABEL: load_trunc_2i16_to_2i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 1
-; CHECK-NEXT: st.b $a0, $sp, 1
; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 0
-; CHECK-NEXT: st.b $a0, $sp, 0
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a0, 0
+; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 1
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 0
+; CHECK-NEXT: vpackev.b $vr0, $vr0, $vr1
; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 0
; CHECK-NEXT: st.h $a0, $a1, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%a = load <2 x i16>, ptr %ptr
%trunc = trunc <2 x i16> %a to <2 x i8>
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
index 3cc9b62d76736..bbcfbe1b07260 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
@@ -6,11 +6,11 @@ define void @load_zext_2i8_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_2i8_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.h $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -24,11 +24,10 @@ define void @load_zext_4i8_to_4i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_4i8_to_4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -42,11 +41,9 @@ define void @load_zext_8i8_to_8i16(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_8i8_to_8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -60,11 +57,10 @@ define void @load_zext_2i16_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_2i16_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -78,11 +74,9 @@ define void @load_zext_4i16_to_4i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_4i16_to_4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -96,11 +90,9 @@ define void @load_zext_2i32_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_2i32_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.w $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -114,18 +106,11 @@ define void @load_zext_16i8_to_16i16(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_16i8_to_16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI6_2)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.b $vr1, $vr4, $vr1, $vr2
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr4, $vr3
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -138,28 +123,17 @@ define void @load_zext_16i8_to_16i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_16i8_to_16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI7_2)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.b $vr1, $vr4, $vr1, $vr2
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_3)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI7_3)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_4)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI7_4)
-; CHECK-NEXT: vshuf.b $vr3, $vr4, $vr3, $vr2
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT: vshuf.b $vr2, $vr4, $vr5, $vr2
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr4, $vr6
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr2, $a1, 48
-; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr4, $vr1, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 48
+; CHECK-NEXT: vst $vr4, $a1, 32
+; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -172,46 +146,29 @@ define void @load_zext_16i8_to_16i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_16i8_to_16i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT: vshuf4i.b $vr3, $vr0, 14
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.b $vr3, $vr4, $vr3, $vr1
-; CHECK-NEXT: vshuf.b $vr2, $vr0, $vr0, $vr2
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT: vshuf.b $vr2, $vr4, $vr2, $vr1
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT: vshuf.b $vr5, $vr4, $vr5, $vr1
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_4)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI8_4)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_5)
-; CHECK-NEXT: vld $vr8, $a0, %pc_lo12(.LCPI8_5)
-; CHECK-NEXT: vshuf.b $vr6, $vr4, $vr6, $vr1
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT: vshuf.b $vr7, $vr4, $vr7, $vr1
-; CHECK-NEXT: vshuf.b $vr8, $vr0, $vr0, $vr8
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_6)
-; CHECK-NEXT: vld $vr9, $a0, %pc_lo12(.LCPI8_6)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_7)
-; CHECK-NEXT: vld $vr10, $a0, %pc_lo12(.LCPI8_7)
-; CHECK-NEXT: vshuf.b $vr8, $vr4, $vr8, $vr1
-; CHECK-NEXT: vshuf.b $vr9, $vr0, $vr0, $vr9
-; CHECK-NEXT: vshuf.b $vr1, $vr4, $vr9, $vr1
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr4, $vr10
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 112
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr3
+; CHECK-NEXT: vilvh.w $vr3, $vr1, $vr3
+; CHECK-NEXT: vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvl.w $vr5, $vr1, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr6, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr7, $vr1, $vr6
+; CHECK-NEXT: vilvh.w $vr6, $vr1, $vr6
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr8, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 112
; CHECK-NEXT: vst $vr8, $a1, 96
-; CHECK-NEXT: vst $vr7, $a1, 80
-; CHECK-NEXT: vst $vr6, $a1, 64
-; CHECK-NEXT: vst $vr5, $a1, 48
-; CHECK-NEXT: vst $vr2, $a1, 32
+; CHECK-NEXT: vst $vr6, $a1, 80
+; CHECK-NEXT: vst $vr7, $a1, 64
+; CHECK-NEXT: vst $vr2, $a1, 48
+; CHECK-NEXT: vst $vr5, $a1, 32
; CHECK-NEXT: vst $vr3, $a1, 16
+; CHECK-NEXT: vst $vr4, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -224,18 +181,11 @@ define void @load_zext_8i16_to_8i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_8i16_to_8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI9_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI9_2)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.h $vr2, $vr4, $vr1
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr4
-; CHECK-NEXT: vst $vr3, $a1, 0
-; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -248,28 +198,17 @@ define void @load_zext_8i16_to_8i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_8i16_to_8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: vshuf4i.h $vr3, $vr0, 14
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vori.b $vr5, $vr1, 0
-; CHECK-NEXT: vshuf.h $vr5, $vr4, $vr3
-; CHECK-NEXT: vshuf.h $vr2, $vr0, $vr0
-; CHECK-NEXT: vori.b $vr3, $vr1, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_3)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI10_3)
-; CHECK-NEXT: vshuf.h $vr3, $vr4, $vr2
-; CHECK-NEXT: vshuf.h $vr6, $vr0, $vr0
-; CHECK-NEXT: vshuf.h $vr1, $vr4, $vr6
-; CHECK-NEXT: vshuf.h $vr7, $vr0, $vr4
-; CHECK-NEXT: vst $vr7, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 48
-; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr5, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 48
+; CHECK-NEXT: vst $vr4, $a1, 32
+; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -282,16 +221,11 @@ define void @load_zext_4i32_to_4i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_4i32_to_4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI11_1)
-; CHECK-NEXT: vshuf4i.w $vr3, $vr0, 14
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.w $vr1, $vr4, $vr3
-; CHECK-NEXT: vshuf.w $vr2, $vr0, $vr4
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
; CHECK-NEXT: vst $vr2, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
; CHECK-NEXT: ret
entry:
%A = load <4 x i32>, ptr %ptr
More information about the llvm-commits
mailing list