[llvm] [LoongArch] lower vectorshuffle to zero or any extend (PR #129485)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 9 21:51:06 PDT 2025
https://github.com/tangaac updated https://github.com/llvm/llvm-project/pull/129485
>From 94caefb81c62df594f9e7396d9714d9e5c7c7297 Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Mon, 3 Mar 2025 15:06:56 +0800
Subject: [PATCH 1/4] Lower vector shuffle to zero or any extend
---
.../LoongArch/LoongArchISelLowering.cpp | 132 +++++++++++
.../Target/LoongArch/LoongArchISelLowering.h | 2 +-
.../CodeGen/LoongArch/lsx/build-vector.ll | 9 +-
llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll | 185 ++++++---------
llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll | 224 ++++++------------
5 files changed, 288 insertions(+), 264 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index dceb3c682d2df..ea56c28fe201e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -542,6 +542,135 @@ fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
return true;
}
+/// Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero.
+static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, APInt &KnownUndef,
+ APInt &KnownZero) {
+ int Size = Mask.size();
+ KnownUndef = KnownZero = APInt::getZero(Size);
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ int VectorSizeInBits = V1.getValueSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Size;
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+ for (int i = 0; i < Size; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ continue;
+ }
+ if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ KnownZero.setBit(i);
+ continue;
+ }
+ }
+}
+
+/// Lower VECTOR_SHUFFLE as ZERO_EXTEND Or ANY_EXTEND (if possible).
+///
+/// For example:
+/// %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer,
+/// <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+/// %3 = bitcast <4 x i32> %2 to <2 x i64>
+/// is lowered to:
+/// (VREPLI $v1, 0)
+/// (VILVL $v0, $v1, $v0)
+static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
+ ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ int Bits = VT.getSizeInBits();
+ int EltBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+
+ APInt KnownUndef, KnownZero;
+ computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
+ APInt Zeroable = KnownUndef | KnownZero;
+ if (Zeroable.isAllOnes())
+ return DAG.getConstant(0, DL, VT);
+
+ // Define a helper function to check a particular ext-scale and lower to it if
+ // valid.
+ auto Lower = [&](int Scale) -> SDValue {
+ SDValue InputV;
+ bool AnyExt = true;
+ int Offset = 0;
+ for (int i = 0; i < NumElements; i++) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ if (i % Scale != 0) {
+ // Each of the extended elements need to be zeroable.
+ if (!Zeroable[i])
+ return SDValue();
+
+ AnyExt = false;
+ continue;
+ }
+
+ // Each of the base elements needs to be consecutive indices into the
+ // same input vector.
+ SDValue V = M < NumElements ? V1 : V2;
+ M = M % NumElements;
+ if (!InputV) {
+ InputV = V;
+ Offset = M - (i / Scale);
+
+ // These offset can't be handled
+ if (Offset % (NumElements / Scale))
+ return SDValue();
+ } else if (InputV != V)
+ return SDValue();
+
+ if (M != (Offset + (i / Scale)))
+ return SDValue(); // Non-consecutive strided elements.
+ }
+
+ // If we fail to find an input, we have a zero-shuffle which should always
+ // have already been handled.
+ if (!InputV)
+ return SDValue();
+
+ do {
+ unsigned VilVLoHi = LoongArchISD::VILVL;
+ if (Offset >= (NumElements / 2)) {
+ VilVLoHi = LoongArchISD::VILVH;
+ Offset -= (NumElements / 2);
+ }
+
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ SDValue Ext =
+ AnyExt ? DAG.getUNDEF(InputVT) : DAG.getConstant(0, DL, InputVT);
+ InputV = DAG.getBitcast(InputVT, InputV);
+ InputV = DAG.getNode(VilVLoHi, DL, InputVT, Ext, InputV);
+ Scale /= 2;
+ EltBits *= 2;
+ NumElements /= 2;
+ } while (Scale > 1);
+ return DAG.getBitcast(VT, InputV);
+ };
+
+ // Each iteration, try extending the elements half as much, but into twice as
+ // many elements.
+ for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
+ NumExtElements *= 2) {
+ if (SDValue V = Lower(NumElements / NumExtElements))
+ return V;
+ }
+ return SDValue();
+}
+
/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
///
/// VREPLVEI performs vector broadcast based on an element specified by an
@@ -956,6 +1085,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
return Result;
+ if ((Result =
+ lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG)))
+ return Result;
if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
return Result;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f8d4cef76b955..c30813fb176a1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -271,7 +271,7 @@ class LoongArchTargetLowering : public TargetLowering {
unsigned *Fast = nullptr) const override;
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override {
- return false;
+ return isTypeLegal(VT.getSimpleVT());
}
bool shouldConsiderGEPOffsetSplit() const override { return true; }
bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override;
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index eaab6524c5317..984b6f3d74866 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -374,10 +374,11 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0)
+; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI24_0)
+; CHECK-NEXT: vrepli.b $vr2, 0
+; CHECK-NEXT: vshuf.w $vr1, $vr2, $vr0
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x i32>, ptr %src
%e = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
index 75639ae090661..3b6986aec5b6a 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
@@ -6,10 +6,10 @@ define void @load_sext_2i8_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_2i8_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.h $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
; CHECK-NEXT: vst $vr0, $a1, 0
@@ -25,10 +25,9 @@ define void @load_sext_4i8_to_4i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_4i8_to_4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
; CHECK-NEXT: vst $vr0, $a1, 0
@@ -61,10 +60,9 @@ define void @load_sext_2i16_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_2i16_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr0, $vr1
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 48
; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
; CHECK-NEXT: vst $vr0, $a1, 0
@@ -114,17 +112,14 @@ define void @load_sext_16i8_to_16i16(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_16i8_to_16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vilvl.b $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
; CHECK-NEXT: vslli.h $vr1, $vr1, 8
; CHECK-NEXT: vsrai.h $vr1, $vr1, 8
-; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.h $vr0, $vr0, 8
; CHECK-NEXT: vsrai.h $vr0, $vr0, 8
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -137,33 +132,24 @@ define void @load_sext_16i8_to_16i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_16i8_to_16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI7_2)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr1, $vr2
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr2, $vr0, $vr1
+; CHECK-NEXT: vslli.w $vr2, $vr2, 24
+; CHECK-NEXT: vsrai.w $vr2, $vr2, 24
+; CHECK-NEXT: vilvh.h $vr1, $vr0, $vr1
; CHECK-NEXT: vslli.w $vr1, $vr1, 24
; CHECK-NEXT: vsrai.w $vr1, $vr1, 24
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_3)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI7_3)
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr3, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr3, $vr3, 24
; CHECK-NEXT: vsrai.w $vr3, $vr3, 24
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr4, $vr2
-; CHECK-NEXT: vslli.w $vr4, $vr4, 24
-; CHECK-NEXT: vsrai.w $vr4, $vr4, 24
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr2
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr4, $a1, 48
+; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr3, $a1, 32
; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -176,59 +162,44 @@ define void @load_sext_16i8_to_16i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_16i8_to_16i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: vshuf4i.b $vr2, $vr0, 14
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT: vshuf.b $vr2, $vr0, $vr2, $vr1
-; CHECK-NEXT: vslli.d $vr2, $vr2, 56
-; CHECK-NEXT: vsrai.d $vr2, $vr2, 56
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr3, $vr1
+; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr2, $vr0, $vr1
+; CHECK-NEXT: vilvl.w $vr3, $vr0, $vr2
; CHECK-NEXT: vslli.d $vr3, $vr3, 56
; CHECK-NEXT: vsrai.d $vr3, $vr3, 56
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr0, $vr4
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT: vshuf.b $vr4, $vr0, $vr4, $vr1
+; CHECK-NEXT: vilvh.w $vr2, $vr0, $vr2
+; CHECK-NEXT: vslli.d $vr2, $vr2, 56
+; CHECK-NEXT: vsrai.d $vr2, $vr2, 56
+; CHECK-NEXT: vilvh.h $vr1, $vr0, $vr1
+; CHECK-NEXT: vilvl.w $vr4, $vr0, $vr1
; CHECK-NEXT: vslli.d $vr4, $vr4, 56
; CHECK-NEXT: vsrai.d $vr4, $vr4, 56
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_4)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI8_4)
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr5, $vr1
-; CHECK-NEXT: vslli.d $vr5, $vr5, 56
-; CHECK-NEXT: vsrai.d $vr5, $vr5, 56
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_5)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI8_5)
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr6, $vr1
+; CHECK-NEXT: vilvh.w $vr1, $vr0, $vr1
+; CHECK-NEXT: vslli.d $vr1, $vr1, 56
+; CHECK-NEXT: vsrai.d $vr1, $vr1, 56
+; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr5, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr6, $vr0, $vr5
; CHECK-NEXT: vslli.d $vr6, $vr6, 56
; CHECK-NEXT: vsrai.d $vr6, $vr6, 56
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_6)
-; CHECK-NEXT: vld $vr8, $a0, %pc_lo12(.LCPI8_6)
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr7, $vr1
+; CHECK-NEXT: vilvh.w $vr5, $vr0, $vr5
+; CHECK-NEXT: vslli.d $vr5, $vr5, 56
+; CHECK-NEXT: vsrai.d $vr5, $vr5, 56
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr7, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr7, $vr7, 56
; CHECK-NEXT: vsrai.d $vr7, $vr7, 56
-; CHECK-NEXT: vshuf.b $vr8, $vr0, $vr0, $vr8
-; CHECK-NEXT: vshuf.b $vr8, $vr0, $vr8, $vr1
-; CHECK-NEXT: vslli.d $vr8, $vr8, 56
-; CHECK-NEXT: vsrai.d $vr8, $vr8, 56
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr8, $a1, 112
+; CHECK-NEXT: vst $vr0, $a1, 112
; CHECK-NEXT: vst $vr7, $a1, 96
-; CHECK-NEXT: vst $vr6, $a1, 80
-; CHECK-NEXT: vst $vr5, $a1, 64
-; CHECK-NEXT: vst $vr4, $a1, 48
-; CHECK-NEXT: vst $vr3, $a1, 32
+; CHECK-NEXT: vst $vr5, $a1, 80
+; CHECK-NEXT: vst $vr6, $a1, 64
+; CHECK-NEXT: vst $vr1, $a1, 48
+; CHECK-NEXT: vst $vr4, $a1, 32
; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -241,17 +212,14 @@ define void @load_sext_8i16_to_8i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_8i16_to_8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vilvl.h $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr1, $vr1, 16
; CHECK-NEXT: vsrai.w $vr1, $vr1, 16
-; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
; CHECK-NEXT: vslli.w $vr0, $vr0, 16
; CHECK-NEXT: vsrai.w $vr0, $vr0, 16
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -264,34 +232,24 @@ define void @load_sext_8i16_to_8i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_8i16_to_8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: vshuf4i.h $vr2, $vr0, 14
-; CHECK-NEXT: vori.b $vr3, $vr1, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: vld $vr4, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr2
-; CHECK-NEXT: vslli.d $vr2, $vr3, 48
+; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr2, $vr0, $vr1
+; CHECK-NEXT: vslli.d $vr2, $vr2, 48
; CHECK-NEXT: vsrai.d $vr2, $vr2, 48
-; CHECK-NEXT: vshuf.h $vr4, $vr0, $vr0
-; CHECK-NEXT: vori.b $vr3, $vr1, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr4
+; CHECK-NEXT: vilvh.w $vr1, $vr0, $vr1
+; CHECK-NEXT: vslli.d $vr1, $vr1, 48
+; CHECK-NEXT: vsrai.d $vr1, $vr1, 48
+; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.w $vr3, $vr0, $vr0
; CHECK-NEXT: vslli.d $vr3, $vr3, 48
; CHECK-NEXT: vsrai.d $vr3, $vr3, 48
-; CHECK-NEXT: vshuf.h $vr5, $vr0, $vr0
-; CHECK-NEXT: vori.b $vr4, $vr1, 0
-; CHECK-NEXT: vshuf.h $vr4, $vr0, $vr5
-; CHECK-NEXT: vslli.d $vr4, $vr4, 48
-; CHECK-NEXT: vsrai.d $vr4, $vr4, 48
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vslli.d $vr0, $vr1, 48
+; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0
+; CHECK-NEXT: vslli.d $vr0, $vr0, 48
; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr4, $a1, 48
+; CHECK-NEXT: vst $vr0, $a1, 48
; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -304,15 +262,14 @@ define void @load_sext_4i32_to_4i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_sext_4i32_to_4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
-; CHECK-NEXT: vshuf4i.w $vr1, $vr1, 16
+; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 16
; CHECK-NEXT: vslli.d $vr1, $vr1, 32
; CHECK-NEXT: vsrai.d $vr1, $vr1, 32
-; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16
+; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 50
; CHECK-NEXT: vslli.d $vr0, $vr0, 32
; CHECK-NEXT: vsrai.d $vr0, $vr0, 32
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <4 x i32>, ptr %ptr
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
index 3cc9b62d76736..bbcfbe1b07260 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-zext.ll
@@ -6,11 +6,11 @@ define void @load_zext_2i8_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_2i8_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.h $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -24,11 +24,10 @@ define void @load_zext_4i8_to_4i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_4i8_to_4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -42,11 +41,9 @@ define void @load_zext_8i8_to_8i16(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_8i8_to_8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr2, $vr0
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -60,11 +57,10 @@ define void @load_zext_2i16_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_2i16_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.w $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -78,11 +74,9 @@ define void @load_zext_4i16_to_4i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_4i16_to_4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.h $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -96,11 +90,9 @@ define void @load_zext_2i32_to_2i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_2i32_to_2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld.d $a0, $a0, 0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; CHECK-NEXT: vrepli.b $vr2, 0
-; CHECK-NEXT: vshuf.w $vr0, $vr1, $vr2
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: ret
entry:
@@ -114,18 +106,11 @@ define void @load_zext_16i8_to_16i16(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_16i8_to_16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI6_2)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.b $vr1, $vr4, $vr1, $vr2
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr4, $vr3
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -138,28 +123,17 @@ define void @load_zext_16i8_to_16i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_16i8_to_16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI7_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI7_2)
-; CHECK-NEXT: vshuf.b $vr1, $vr0, $vr0, $vr1
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.b $vr1, $vr4, $vr1, $vr2
-; CHECK-NEXT: vshuf.b $vr3, $vr0, $vr0, $vr3
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_3)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI7_3)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_4)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI7_4)
-; CHECK-NEXT: vshuf.b $vr3, $vr4, $vr3, $vr2
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT: vshuf.b $vr2, $vr4, $vr5, $vr2
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr4, $vr6
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr2, $a1, 48
-; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr1, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr4, $vr1, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 48
+; CHECK-NEXT: vst $vr4, $a1, 32
+; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -172,46 +146,29 @@ define void @load_zext_16i8_to_16i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_16i8_to_16i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI8_1)
-; CHECK-NEXT: vshuf4i.b $vr3, $vr0, 14
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.b $vr3, $vr4, $vr3, $vr1
-; CHECK-NEXT: vshuf.b $vr2, $vr0, $vr0, $vr2
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_2)
-; CHECK-NEXT: vld $vr5, $a0, %pc_lo12(.LCPI8_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_3)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI8_3)
-; CHECK-NEXT: vshuf.b $vr2, $vr4, $vr2, $vr1
-; CHECK-NEXT: vshuf.b $vr5, $vr0, $vr0, $vr5
-; CHECK-NEXT: vshuf.b $vr5, $vr4, $vr5, $vr1
-; CHECK-NEXT: vshuf.b $vr6, $vr0, $vr0, $vr6
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_4)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI8_4)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_5)
-; CHECK-NEXT: vld $vr8, $a0, %pc_lo12(.LCPI8_5)
-; CHECK-NEXT: vshuf.b $vr6, $vr4, $vr6, $vr1
-; CHECK-NEXT: vshuf.b $vr7, $vr0, $vr0, $vr7
-; CHECK-NEXT: vshuf.b $vr7, $vr4, $vr7, $vr1
-; CHECK-NEXT: vshuf.b $vr8, $vr0, $vr0, $vr8
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_6)
-; CHECK-NEXT: vld $vr9, $a0, %pc_lo12(.LCPI8_6)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_7)
-; CHECK-NEXT: vld $vr10, $a0, %pc_lo12(.LCPI8_7)
-; CHECK-NEXT: vshuf.b $vr8, $vr4, $vr8, $vr1
-; CHECK-NEXT: vshuf.b $vr9, $vr0, $vr0, $vr9
-; CHECK-NEXT: vshuf.b $vr1, $vr4, $vr9, $vr1
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr4, $vr10
-; CHECK-NEXT: vst $vr0, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 112
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.b $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr3
+; CHECK-NEXT: vilvh.w $vr3, $vr1, $vr3
+; CHECK-NEXT: vilvh.h $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvl.w $vr5, $vr1, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.h $vr6, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr7, $vr1, $vr6
+; CHECK-NEXT: vilvh.w $vr6, $vr1, $vr6
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr8, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 112
; CHECK-NEXT: vst $vr8, $a1, 96
-; CHECK-NEXT: vst $vr7, $a1, 80
-; CHECK-NEXT: vst $vr6, $a1, 64
-; CHECK-NEXT: vst $vr5, $a1, 48
-; CHECK-NEXT: vst $vr2, $a1, 32
+; CHECK-NEXT: vst $vr6, $a1, 80
+; CHECK-NEXT: vst $vr7, $a1, 64
+; CHECK-NEXT: vst $vr2, $a1, 48
+; CHECK-NEXT: vst $vr5, $a1, 32
; CHECK-NEXT: vst $vr3, $a1, 16
+; CHECK-NEXT: vst $vr4, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <16 x i8>, ptr %ptr
@@ -224,18 +181,11 @@ define void @load_zext_8i16_to_8i32(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_8i16_to_8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI9_1)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_2)
-; CHECK-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI9_2)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.h $vr2, $vr4, $vr1
-; CHECK-NEXT: vshuf.h $vr3, $vr0, $vr4
-; CHECK-NEXT: vst $vr3, $a1, 0
-; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
+; CHECK-NEXT: vst $vr2, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -248,28 +198,17 @@ define void @load_zext_8i16_to_8i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_8i16_to_8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: vshuf4i.h $vr3, $vr0, 14
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vori.b $vr5, $vr1, 0
-; CHECK-NEXT: vshuf.h $vr5, $vr4, $vr3
-; CHECK-NEXT: vshuf.h $vr2, $vr0, $vr0
-; CHECK-NEXT: vori.b $vr3, $vr1, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_2)
-; CHECK-NEXT: vld $vr6, $a0, %pc_lo12(.LCPI10_2)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_3)
-; CHECK-NEXT: vld $vr7, $a0, %pc_lo12(.LCPI10_3)
-; CHECK-NEXT: vshuf.h $vr3, $vr4, $vr2
-; CHECK-NEXT: vshuf.h $vr6, $vr0, $vr0
-; CHECK-NEXT: vshuf.h $vr1, $vr4, $vr6
-; CHECK-NEXT: vshuf.h $vr7, $vr0, $vr4
-; CHECK-NEXT: vst $vr7, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 48
-; CHECK-NEXT: vst $vr3, $a1, 32
-; CHECK-NEXT: vst $vr5, $a1, 16
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr3, $vr1, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr1, $vr2
+; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 48
+; CHECK-NEXT: vst $vr4, $a1, 32
+; CHECK-NEXT: vst $vr2, $a1, 16
+; CHECK-NEXT: vst $vr3, $a1, 0
; CHECK-NEXT: ret
entry:
%A = load <8 x i16>, ptr %ptr
@@ -282,16 +221,11 @@ define void @load_zext_4i32_to_4i64(ptr %ptr, ptr %dst) {
; CHECK-LABEL: load_zext_4i32_to_4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1)
-; CHECK-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI11_1)
-; CHECK-NEXT: vshuf4i.w $vr3, $vr0, 14
-; CHECK-NEXT: vrepli.b $vr4, 0
-; CHECK-NEXT: vshuf.w $vr1, $vr4, $vr3
-; CHECK-NEXT: vshuf.w $vr2, $vr0, $vr4
+; CHECK-NEXT: vrepli.b $vr1, 0
+; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr0
+; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 16
; CHECK-NEXT: vst $vr2, $a1, 0
-; CHECK-NEXT: vst $vr1, $a1, 16
; CHECK-NEXT: ret
entry:
%A = load <4 x i32>, ptr %ptr
>From 191308cd2840938a8129750820f0725dd487341e Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Mon, 3 Mar 2025 16:13:26 +0800
Subject: [PATCH 2/4] Replace getUNDEF with getFreeze
---
.../LoongArch/LoongArchISelLowering.cpp | 2 +-
llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll | 24 +++++++++----------
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ea56c28fe201e..4c858df8f5562 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -651,7 +651,7 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext =
- AnyExt ? DAG.getUNDEF(InputVT) : DAG.getConstant(0, DL, InputVT);
+ AnyExt ? DAG.getFreeze(InputV) : DAG.getConstant(0, DL, InputVT);
InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(VilVLoHi, DL, InputVT, Ext, InputV);
Scale /= 2;
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
index 3b6986aec5b6a..9485df746ff1c 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll
@@ -133,10 +133,10 @@ define void @load_sext_16i8_to_16i32(ptr %ptr, ptr %dst) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
-; CHECK-NEXT: vilvl.h $vr2, $vr0, $vr1
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1
; CHECK-NEXT: vslli.w $vr2, $vr2, 24
; CHECK-NEXT: vsrai.w $vr2, $vr2, 24
-; CHECK-NEXT: vilvh.h $vr1, $vr0, $vr1
+; CHECK-NEXT: vilvh.h $vr1, $vr1, $vr1
; CHECK-NEXT: vslli.w $vr1, $vr1, 24
; CHECK-NEXT: vsrai.w $vr1, $vr1, 24
; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
@@ -163,26 +163,26 @@ define void @load_sext_16i8_to_16i64(ptr %ptr, ptr %dst) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0
-; CHECK-NEXT: vilvl.h $vr2, $vr0, $vr1
-; CHECK-NEXT: vilvl.w $vr3, $vr0, $vr2
+; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1
+; CHECK-NEXT: vilvl.w $vr3, $vr2, $vr2
; CHECK-NEXT: vslli.d $vr3, $vr3, 56
; CHECK-NEXT: vsrai.d $vr3, $vr3, 56
-; CHECK-NEXT: vilvh.w $vr2, $vr0, $vr2
+; CHECK-NEXT: vilvh.w $vr2, $vr2, $vr2
; CHECK-NEXT: vslli.d $vr2, $vr2, 56
; CHECK-NEXT: vsrai.d $vr2, $vr2, 56
-; CHECK-NEXT: vilvh.h $vr1, $vr0, $vr1
-; CHECK-NEXT: vilvl.w $vr4, $vr0, $vr1
+; CHECK-NEXT: vilvh.h $vr1, $vr1, $vr1
+; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr1
; CHECK-NEXT: vslli.d $vr4, $vr4, 56
; CHECK-NEXT: vsrai.d $vr4, $vr4, 56
-; CHECK-NEXT: vilvh.w $vr1, $vr0, $vr1
+; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1
; CHECK-NEXT: vslli.d $vr1, $vr1, 56
; CHECK-NEXT: vsrai.d $vr1, $vr1, 56
; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0
; CHECK-NEXT: vilvl.h $vr5, $vr0, $vr0
-; CHECK-NEXT: vilvl.w $vr6, $vr0, $vr5
+; CHECK-NEXT: vilvl.w $vr6, $vr5, $vr5
; CHECK-NEXT: vslli.d $vr6, $vr6, 56
; CHECK-NEXT: vsrai.d $vr6, $vr6, 56
-; CHECK-NEXT: vilvh.w $vr5, $vr0, $vr5
+; CHECK-NEXT: vilvh.w $vr5, $vr5, $vr5
; CHECK-NEXT: vslli.d $vr5, $vr5, 56
; CHECK-NEXT: vsrai.d $vr5, $vr5, 56
; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
@@ -233,10 +233,10 @@ define void @load_sext_8i16_to_8i64(ptr %ptr, ptr %dst) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vilvl.w $vr2, $vr0, $vr1
+; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr1
; CHECK-NEXT: vslli.d $vr2, $vr2, 48
; CHECK-NEXT: vsrai.d $vr2, $vr2, 48
-; CHECK-NEXT: vilvh.w $vr1, $vr0, $vr1
+; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1
; CHECK-NEXT: vslli.d $vr1, $vr1, 48
; CHECK-NEXT: vsrai.d $vr1, $vr1, 48
; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0
>From d33a74a59801a7cdd903757a52d678cea2a0b3a1 Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Wed, 5 Mar 2025 09:41:07 +0800
Subject: [PATCH 3/4] add more checks in isShuffleMaskLegal
---
llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index c30813fb176a1..002fad0e20759 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -271,6 +271,13 @@ class LoongArchTargetLowering : public TargetLowering {
unsigned *Fast = nullptr) const override;
bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override {
+ if (!VT.isSimple())
+ return false;
+
+ // Not for i1 vectors
+ if (VT.getSimpleVT().getScalarType() == MVT::i1)
+ return false;
+
return isTypeLegal(VT.getSimpleVT());
}
bool shouldConsiderGEPOffsetSplit() const override { return true; }
>From 3f8fd3af2adcd3c4136b1f69facf34ed4c4568ca Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Mon, 10 Mar 2025 11:37:18 +0800
Subject: [PATCH 4/4] replace undef with poison in test
---
.../LoongArch/LoongArchISelLowering.cpp | 3 --
.../CodeGen/LoongArch/lsx/build-vector.ll | 42 +++++++++----------
2 files changed, 21 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 4c858df8f5562..f69019bf34f37 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -545,9 +545,6 @@ fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
-/// Either it is an undef element in the shuffle mask, the element of the input
-/// referenced is undef, or the element of the input referenced is known to be
-/// zero.
static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
SDValue V2, APInt &KnownUndef,
APInt &KnownZero) {
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index 984b6f3d74866..9c9288c7c8717 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -8,8 +8,8 @@ define void @buildvector_v16i8_splat(ptr %dst, i8 %a0) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %insert = insertelement <16 x i8> undef, i8 %a0, i8 0
- %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %insert = insertelement <16 x i8> poison, i8 %a0, i8 0
+ %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
store <16 x i8> %splat, ptr %dst
ret void
}
@@ -21,8 +21,8 @@ define void @buildvector_v8i16_splat(ptr %dst, i16 %a0) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %insert = insertelement <8 x i16> undef, i16 %a0, i8 0
- %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %insert = insertelement <8 x i16> poison, i16 %a0, i8 0
+ %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
store <8 x i16> %splat, ptr %dst
ret void
}
@@ -34,8 +34,8 @@ define void @buildvector_v4i32_splat(ptr %dst, i32 %a0) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %insert = insertelement <4 x i32> undef, i32 %a0, i8 0
- %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %insert = insertelement <4 x i32> poison, i32 %a0, i8 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
store <4 x i32> %splat, ptr %dst
ret void
}
@@ -47,8 +47,8 @@ define void @buildvector_v2i64_splat(ptr %dst, i64 %a0) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %insert = insertelement <2 x i64> undef, i64 %a0, i8 0
- %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+ %insert = insertelement <2 x i64> poison, i64 %a0, i8 0
+ %splat = shufflevector <2 x i64> %insert, <2 x i64> poison, <2 x i32> zeroinitializer
store <2 x i64> %splat, ptr %dst
ret void
}
@@ -61,8 +61,8 @@ define void @buildvector_v4f32_splat(ptr %dst, float %a0) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %insert = insertelement <4 x float> undef, float %a0, i8 0
- %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
+ %insert = insertelement <4 x float> poison, float %a0, i8 0
+ %splat = shufflevector <4 x float> %insert, <4 x float> poison, <4 x i32> zeroinitializer
store <4 x float> %splat, ptr %dst
ret void
}
@@ -75,8 +75,8 @@ define void @buildvector_v2f64_splat(ptr %dst, double %a0) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %insert = insertelement <2 x double> undef, double %a0, i8 0
- %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer
+ %insert = insertelement <2 x double> poison, double %a0, i8 0
+ %splat = shufflevector <2 x double> %insert, <2 x double> poison, <2 x i32> zeroinitializer
store <2 x double> %splat, ptr %dst
ret void
}
@@ -252,7 +252,7 @@ define void @buildvector_v16i8(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %ins0 = insertelement <16 x i8> undef, i8 %a0, i32 0
+ %ins0 = insertelement <16 x i8> poison, i8 %a0, i32 0
%ins1 = insertelement <16 x i8> %ins0, i8 %a1, i32 1
%ins2 = insertelement <16 x i8> %ins1, i8 %a2, i32 2
%ins3 = insertelement <16 x i8> %ins2, i8 %a3, i32 3
@@ -287,7 +287,7 @@ define void @buildvector_v8i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %ins0 = insertelement <8 x i16> poison, i16 %a0, i32 0
%ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1
%ins2 = insertelement <8 x i16> %ins1, i16 %a2, i32 2
%ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
@@ -309,7 +309,7 @@ define void @buildvector_v4i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nou
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %ins0 = insertelement <4 x i32> poison, i32 %a0, i32 0
%ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1
%ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
%ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
@@ -325,7 +325,7 @@ define void @buildvector_v2i64(ptr %dst, i64 %a0, i64 %a1) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %ins0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %ins0 = insertelement <2 x i64> poison, i64 %a0, i32 0
%ins1 = insertelement <2 x i64> %ins0, i64 %a1, i32 1
store <2 x i64> %ins1, ptr %dst
ret void
@@ -345,7 +345,7 @@ define void @buildvector_v4f32(ptr %dst, float %a0, float %a1, float %a2, float
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %ins0 = insertelement <4 x float> undef, float %a0, i32 0
+ %ins0 = insertelement <4 x float> poison, float %a0, i32 0
%ins1 = insertelement <4 x float> %ins0, float %a1, i32 1
%ins2 = insertelement <4 x float> %ins1, float %a2, i32 2
%ins3 = insertelement <4 x float> %ins2, float %a3, i32 3
@@ -363,15 +363,15 @@ define void @buildvector_v2f64(ptr %dst, double %a0, double %a1) nounwind {
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
- %ins0 = insertelement <2 x double> undef, double %a0, i32 0
+ %ins0 = insertelement <2 x double> poison, double %a0, i32 0
%ins1 = insertelement <2 x double> %ins0, double %a1, i32 1
store <2 x double> %ins1, ptr %dst
ret void
}
;; If `isShuffleMaskLegal` returns true, it will lead to an infinite loop.
-define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
-; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
+define void @extract1_i32_zext_insert0_i64_poison(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: extract1_i32_zext_insert0_i64_poison:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0)
@@ -383,7 +383,7 @@ define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
%v = load volatile <4 x i32>, ptr %src
%e = extractelement <4 x i32> %v, i32 1
%z = zext i32 %e to i64
- %r = insertelement <2 x i64> undef, i64 %z, i32 0
+ %r = insertelement <2 x i64> poison, i64 %z, i32 0
store <2 x i64> %r, ptr %dst
ret void
}
More information about the llvm-commits
mailing list