[llvm] [RISCV] Merge shuffle sources if lanes are disjoint (PR #119401)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 11 21:40:23 PST 2024
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/119401
>From e6bda9664357bd248711bf154dc7e57825cc2fbd Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 10 Dec 2024 22:27:55 +0800
Subject: [PATCH 1/5] Precommit tests
---
.../RISCV/rvv/fixed-vectors-fp-shuffles.ll | 74 +++++++++++++++++++
.../RISCV/rvv/fixed-vectors-int-shuffles.ll | 74 +++++++++++++++++++
2 files changed, 148 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 0db45ae71bc8ac..6b8d4cc61f60c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -395,3 +395,77 @@ define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) {
%s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x half> %s
}
+
+define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI30_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v20, (a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI30_1)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_1)
+; CHECK-NEXT: vle16.v v22, (a0)
+; CHECK-NEXT: lui a0, 15
+; CHECK-NEXT: addi a0, a0, 240
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
+; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+ %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
+ ret <16 x float> %out
+}
+
+define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x float> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI31_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: li a0, -272
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+ %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
+ ret <16 x float> %out
+}
+
+define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x float> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI32_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v20, (a0)
+; CHECK-NEXT: lui a0, 15
+; CHECK-NEXT: addi a0, a0, 240
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgather.vi v16, v8, 7
+; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+ %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
+ ret <16 x float> %out
+}
+
+define <16 x float> @shuffle_disjoint_lanes_one_splat(float %v, <16 x float> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI33_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: lui a0, 15
+; CHECK-NEXT: addi a0, a0, 240
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vfmv.v.f v12, fa0
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <16 x float> poison, float %v, i32 0
+ %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
+ %out = shufflevector <16 x float> %splat, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
+ ret <16 x float> %out
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index ebcea741a2e8bb..83e39adacbe6c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1073,3 +1073,77 @@ define <16 x i64> @shuffle_zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) {
%out = shufflevector <16 x i64> %v1, <16 x i64> %v2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
ret <16 x i64> %out
}
+
+define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI70_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v20, (a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI70_1)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_1)
+; CHECK-NEXT: vle16.v v22, (a0)
+; CHECK-NEXT: lui a0, 15
+; CHECK-NEXT: addi a0, a0, 240
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
+; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+ %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
+ ret <16 x i32> %out
+}
+
+define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI71_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: li a0, -272
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
+; CHECK-NEXT: ret
+ %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
+ ret <16 x i32> %out
+}
+
+define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI72_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI72_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v20, (a0)
+; CHECK-NEXT: lui a0, 15
+; CHECK-NEXT: addi a0, a0, 240
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgather.vi v16, v8, 7
+; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t
+; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: ret
+ %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
+ ret <16 x i32> %out
+}
+
+define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
+; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI73_0)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI73_0)
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v16, (a1)
+; CHECK-NEXT: lui a1, 15
+; CHECK-NEXT: addi a1, a1, 240
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i32> poison, i32 %v, i32 0
+ %splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %out = shufflevector <16 x i32> %splat, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
+ ret <16 x i32> %out
+}
>From 862bffd7a01f166561dbb00d8721a4d86595d9ed Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 10 Dec 2024 23:34:29 +0800
Subject: [PATCH 2/5] [RISCV] Merge shuffle sources if lanes are disjoint
In x264, there are a few kernels with shuffles like this:
%41 = add nsw <16 x i32> %39, %40
%42 = sub nsw <16 x i32> %39, %40
%43 = shufflevector <16 x i32> %41, <16 x i32> %42, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
Because this is a complex two-source shuffle, this will get lowered as two vrgather.vvs that are blended together.
vadd.vv v20, v16, v12
vsub.vv v12, v16, v12
vrgatherei16.vv v24, v20, v10
vrgatherei16.vv v24, v12, v16, v0.t
However the indices coming from each source are disjoint, so we can blend the two together and perform a single source shuffle instead:
%41 = add nsw <16 x i32> %39, %40
%42 = sub nsw <16 x i32> %39, %40
%43 = select <0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1> %41, %42
%44 = shufflevector <16 x i32> %43, <16 x i32> poison, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 10, i32 14, i32 6, i32 2, i32 9, i32 13, i32 5, i32 1, i32 8, i32 12, i32 4, i32 0>
The select will likely get merged into the preceding instruction, and then we only have to do one vrgather.vv:
vadd.vv v20, v16, v12
vsub.vv v20, v16, v12, v0.t
vrgatherei16.vv v24, v20, v10
This patch bails if either of the sources is a splat, however, since that will usually already have some sort of cheaper lowering via vrgather.vi.
This improves performance on 525.x264_r by 4.12% with -O3 -flto -march=rva22u64_v on the spacemit-x60: https://lnt.lukelau.me/db_default/v4/nts/71?compare_to=70
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 71 +
.../RISCV/rvv/fixed-vectors-fp-shuffles.ll | 35 +-
.../RISCV/rvv/fixed-vectors-int-shuffles.ll | 66 +-
.../rvv/fixed-vectors-interleaved-access.ll | 1350 +++++++++--------
.../rvv/fixed-vectors-shuffle-deinterleave.ll | 67 +-
5 files changed, 859 insertions(+), 730 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index bf919122452628..48b3b6e7ca20d6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5197,6 +5197,67 @@ static bool isCompressMask(ArrayRef<int> Mask) {
return true;
}
+/// Given a shuffle where the indices are disjoint between the two sources,
+/// e.g.:
+///
+/// t2:v4i8 = vector_shuffle t0:v4i8, t1:v4i8, <2, 7, 1, 4>
+///
+/// Merge the two sources into one and do a single source shuffle:
+///
+/// t2:v4i8 = vselect t1:v4i8, t0:v4i8, <1, 0, 0, 1>
+/// t3:v4i8 = vector_shuffle t2:v4i8, undef, <2, 3, 1, 0>
+///
+/// A vselect will either be merged into a masked instruction or be lowered as a
+/// vmerge.vvm, which is cheaper than a vrgather.vv.
+static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ MVT VT = SVN->getSimpleValueType(0);
+ MVT XLenVT = Subtarget.getXLenVT();
+ SDLoc DL(SVN);
+
+ const ArrayRef<int> Mask = SVN->getMask();
+
+ // Work out which source each lane will come from.
+ SmallVector<int, 16> Srcs(Mask.size(), -1);
+
+ for (int Idx : Mask) {
+ if (Idx == -1)
+ continue;
+ unsigned SrcIdx = Idx % Mask.size();
+ int Src = (uint32_t)Idx < Mask.size() ? 0 : 1;
+ if (Srcs[SrcIdx] == -1)
+ // Mark this source as using this lane.
+ Srcs[SrcIdx] = Src;
+ else if (Srcs[SrcIdx] != Src)
+ // The other source is using this lane: not disjoint.
+ return SDValue();
+ }
+
+ SmallVector<SDValue> SelectMaskVals;
+ for (int Lane : Srcs) {
+ if (Lane == -1)
+ SelectMaskVals.push_back(DAG.getUNDEF(XLenVT));
+ else
+ SelectMaskVals.push_back(DAG.getConstant(Lane, DL, XLenVT));
+ }
+ MVT MaskVT = VT.changeVectorElementType(MVT::i1);
+ SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, SelectMaskVals);
+ SDValue Select = DAG.getNode(ISD::VSELECT, DL, VT, SelectMask,
+ SVN->getOperand(1), SVN->getOperand(0));
+
+ // Move all indices relative to the first source.
+ SmallVector<int> NewMask(Mask.size());
+ for (unsigned I = 0; I < Mask.size(); I++) {
+ if (Mask[I] == -1)
+ NewMask[I] = -1;
+ else
+ NewMask[I] = Mask[I] % Mask.size();
+ }
+
+ return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
+}
+
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue V1 = Op.getOperand(0);
@@ -5540,6 +5601,16 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
}
+ // If the mask indices are disjoint between the two sources, we can lower it
+ // as a vselect + a single source vrgather.vv. Don't do this if the operands
+ // will be splatted since they will be lowered to something cheaper like
+ // vrgather.vi anyway.
+ if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) &&
+ !ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) &&
+ !ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT))
+ if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
+ return V;
+
// Try to pick a profitable operand order.
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 6b8d4cc61f60c7..4cb53d2a9cc83d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -29,10 +29,10 @@ define <4 x half> @shuffle_v4f16(<4 x half> %x, <4 x half> %y) {
define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: shuffle_v8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, -20
+; CHECK-NEXT: li a0, 19
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%s = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
ret <8 x float> %s
@@ -401,17 +401,16 @@ define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI30_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; CHECK-NEXT: vle16.v v20, (a0)
-; CHECK-NEXT: lui a0, %hi(.LCPI30_1)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_1)
-; CHECK-NEXT: vle16.v v22, (a0)
-; CHECK-NEXT: lui a0, 15
-; CHECK-NEXT: addi a0, a0, 240
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: lui a0, 5
+; CHECK-NEXT: addi a0, a0, 1365
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
-; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
-; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v18, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
; CHECK-NEXT: ret
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
ret <16 x float> %out
@@ -422,11 +421,15 @@ define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI31_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; CHECK-NEXT: vle16.v v16, (a0)
-; CHECK-NEXT: li a0, -272
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: li a0, -304
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v18, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
; CHECK-NEXT: ret
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
ret <16 x float> %out
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 83e39adacbe6c4..df06acc1e6598c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -16,10 +16,10 @@ define <4 x i16> @shuffle_v4i16(<4 x i16> %x, <4 x i16> %y) {
define <8 x i32> @shuffle_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: shuffle_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 203
+; CHECK-NEXT: li a0, 52
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
ret <8 x i32> %s
@@ -451,21 +451,14 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI26_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 6
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: lui a0, 8256
-; CHECK-NEXT: addi a0, a0, 2
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v12, a0
-; CHECK-NEXT: li a0, 98
-; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v11, v10, 5
+; CHECK-NEXT: vle8.v v10, (a0)
+; CHECK-NEXT: li a0, 65
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v10, v8, v12
-; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
-; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
+; CHECK-NEXT: vrgather.vv v8, v9, v10
; CHECK-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 14, i32 8, i32 2>
ret <8 x i8> %shuff
@@ -693,12 +686,12 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v10, (a0)
-; CHECK-NEXT: li a0, -22
+; CHECK-NEXT: li a0, 171
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
+; CHECK-NEXT: vrgather.vv v8, v9, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
ret <8 x i8> %res
@@ -709,9 +702,9 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: shuffle_v8i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.i v0, -13
+; CHECK-NEXT: vmv.v.i v0, 12
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %s
@@ -1079,17 +1072,16 @@ define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI70_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; CHECK-NEXT: vle16.v v20, (a0)
-; CHECK-NEXT: lui a0, %hi(.LCPI70_1)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_1)
-; CHECK-NEXT: vle16.v v22, (a0)
-; CHECK-NEXT: lui a0, 15
-; CHECK-NEXT: addi a0, a0, 240
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: lui a0, 5
+; CHECK-NEXT: addi a0, a0, 1365
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
-; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
-; CHECK-NEXT: vmv.v.v v8, v16
+; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v18, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
ret <16 x i32> %out
@@ -1100,11 +1092,15 @@ define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32>
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI71_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; CHECK-NEXT: vle16.v v16, (a0)
-; CHECK-NEXT: li a0, -272
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: li a0, -304
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v18, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
ret <16 x i32> %out
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 8833634be1a0ed..d4ae952325d6b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -183,406 +183,499 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a3, a2, 6
-; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: li a3, 96
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb
-; RV32-NEXT: addi a3, a1, 256
-; RV32-NEXT: addi a4, a1, 128
+; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb
+; RV32-NEXT: addi a3, a1, 128
+; RV32-NEXT: addi a4, a1, 256
; RV32-NEXT: li a2, 32
-; RV32-NEXT: lui a5, 12291
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT: vle32.v v24, (a1)
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a6, 41
-; RV32-NEXT: mul a1, a1, a6
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI8_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0)
-; RV32-NEXT: vle16.v v4, (a1)
-; RV32-NEXT: lui a1, 1
-; RV32-NEXT: addi a5, a5, 3
+; RV32-NEXT: li a5, 48
+; RV32-NEXT: lui a6, 196656
+; RV32-NEXT: lui a7, %hi(.LCPI8_1)
+; RV32-NEXT: addi a7, a7, %lo(.LCPI8_1)
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vle32.v v8, (a4)
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a6, 57
-; RV32-NEXT: mul a4, a4, a6
+; RV32-NEXT: li t0, 88
+; RV32-NEXT: mul a4, a4, t0
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, a1, -64
-; RV32-NEXT: vle32.v v16, (a3)
-; RV32-NEXT: vmv.s.x v3, a5
-; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vmv.s.x v0, a5
+; RV32-NEXT: vle32.v v24, (a3)
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 72
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vle32.v v16, (a1)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, a6, 48
+; RV32-NEXT: vle16.v v4, (a7)
+; RV32-NEXT: vmv.s.x v3, a1
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 13
+; RV32-NEXT: li a3, 80
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vcompress.vm v8, v24, v3
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 52
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v3
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v16, v8, v24, v0
+; RV32-NEXT: vrgatherei16.vv v8, v16, v4
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 56
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: li a1, 192
+; RV32-NEXT: lui a3, 786624
+; RV32-NEXT: lui a4, %hi(.LCPI8_3)
+; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3)
+; RV32-NEXT: addi a3, a3, 192
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vle16.v v16, (a4)
+; RV32-NEXT: vmv.s.x v20, a3
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 57
+; RV32-NEXT: li a3, 88
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t
-; RV32-NEXT: lui a1, 12
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 49
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 80
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vslideup.vi v12, v16, 4
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a4, a3, 4
-; RV32-NEXT: add a3, a4, a3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v24, v16, 16
+; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 25
+; RV32-NEXT: li a3, 48
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT: vslideup.vi v12, v24, 10, v0.t
+; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v20
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a3, a1, 5
-; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: slli a1, a1, 6
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v12, v8
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 21
+; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, 49164
-; RV32-NEXT: lui a3, %hi(.LCPI8_1)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI8_1)
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT: vle16.v v28, (a3)
-; RV32-NEXT: addi a1, a1, 12
-; RV32-NEXT: vmv.s.x v20, a1
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v24, v8, v24, v0
+; RV32-NEXT: vrgatherei16.vv v8, v24, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 41
+; RV32-NEXT: li a3, 40
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vcompress.vm v8, v0, v20
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: li a1, 768
+; RV32-NEXT: lui a3, 768
+; RV32-NEXT: li a4, 3
+; RV32-NEXT: lui a5, 3073
+; RV32-NEXT: lui a6, 3
+; RV32-NEXT: lui a7, 12291
+; RV32-NEXT: lui t0, 12
+; RV32-NEXT: addi a3, a3, 768
+; RV32-NEXT: slli a4, a4, 10
+; RV32-NEXT: addi a5, a5, -1024
+; RV32-NEXT: addi a6, a6, 3
+; RV32-NEXT: addi a7, a7, 3
+; RV32-NEXT: addi t0, t0, 12
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vmv.s.x v12, a3
+; RV32-NEXT: vmv.s.x v7, a4
+; RV32-NEXT: vmv.s.x v1, a5
+; RV32-NEXT: vmv.s.x v3, a6
+; RV32-NEXT: vmv.s.x v2, a7
+; RV32-NEXT: vmv.s.x v8, t0
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 13
+; RV32-NEXT: li a3, 80
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vmv4r.v v8, v24
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 57
+; RV32-NEXT: li a3, 88
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v12
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 49
+; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT: vslideup.vi v12, v16, 2
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 25
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 88
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vslideup.vi v12, v24, 8, v0.t
-; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v12, v8
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 13
+; RV32-NEXT: li a3, 80
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, 196656
-; RV32-NEXT: lui a3, %hi(.LCPI8_2)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI8_2)
-; RV32-NEXT: li a4, 960
-; RV32-NEXT: lui a5, %hi(.LCPI8_3)
-; RV32-NEXT: addi a5, a5, %lo(.LCPI8_3)
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vmv.s.x v0, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT: vle16.v v4, (a3)
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vle16.v v8, (a5)
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a4, a3, 3
-; RV32-NEXT: add a3, a4, a3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vmv.s.x v22, a1
+; RV32-NEXT: vmerge.vvm v4, v16, v24, v0
+; RV32-NEXT: vmv1r.v v0, v1
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 41
+; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT: vcompress.vm v8, v24, v22
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 28
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 57
+; RV32-NEXT: li a3, 88
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 49
+; RV32-NEXT: li a3, 80
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v16, v16, v24, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 36
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v2
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a3, a1, 3
-; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: li a3, 72
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT: vrgatherei16.vv v12, v0, v16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 25
+; RV32-NEXT: li a3, 20
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a3, a1, 5
-; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: li a3, 88
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 80
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vslideup.vi v12, v16, 6, v0.t
-; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v12, v8
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a3, a1, 3
-; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: li a3, 88
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, 786624
-; RV32-NEXT: lui a3, %hi(.LCPI8_4)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI8_4)
-; RV32-NEXT: lui a4, %hi(.LCPI8_5)
-; RV32-NEXT: addi a4, a4, %lo(.LCPI8_5)
-; RV32-NEXT: addi a1, a1, 192
+; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, %hi(.LCPI8_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0)
+; RV32-NEXT: lui a3, %hi(.LCPI8_5)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_5)
+; RV32-NEXT: lui a4, 49164
; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT: vle16.v v8, (a3)
+; RV32-NEXT: vle16.v v24, (a3)
+; RV32-NEXT: addi a4, a4, 12
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vle16.v v12, (a4)
-; RV32-NEXT: vmv.s.x v14, a1
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT: vcompress.vm v16, v24, v14
+; RV32-NEXT: vle16.v v2, (a1)
+; RV32-NEXT: vmv.s.x v0, a4
; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v16, v8, v24
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 57
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 49
+; RV32-NEXT: li a3, 52
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT: vrgatherei16.vv v4, v0, v12
+; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v16, v12, v2
+; RV32-NEXT: lui a1, %hi(.LCPI8_2)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2)
+; RV32-NEXT: vle16.v v12, (a1)
+; RV32-NEXT: lui a1, %hi(.LCPI8_4)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4)
+; RV32-NEXT: vle16.v v14, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 25
+; RV32-NEXT: li a3, 56
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v16, v24
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a3, a1, 5
-; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: li a3, 80
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vslideup.vi v4, v8, 4, v0.t
-; RV32-NEXT: lui a1, 768
-; RV32-NEXT: lui a3, %hi(.LCPI8_6)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI8_6)
-; RV32-NEXT: li a4, 1008
-; RV32-NEXT: addi a1, a1, 768
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT: vle16.v v8, (a3)
-; RV32-NEXT: vmv.s.x v1, a4
-; RV32-NEXT: vmv.s.x v12, a1
+; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 41
+; RV32-NEXT: li a3, 48
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vcompress.vm v24, v16, v12
-; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v24, v16, v12
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 57
+; RV32-NEXT: li a3, 40
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v24, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 25
+; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI8_7)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7)
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT: vle16.v v8, (a1)
-; RV32-NEXT: lui a1, 15
-; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v20, v16, v14
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v20, v8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 49
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, %hi(.LCPI8_6)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6)
+; RV32-NEXT: lui a3, %hi(.LCPI8_7)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_7)
+; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; RV32-NEXT: vle16.v v20, (a3)
+; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT: vle16.v v16, (a1)
+; RV32-NEXT: lui a1, %hi(.LCPI8_9)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9)
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vle16.v v0, (a1)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 28
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vslideup.vi v20, v16, 6
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vrgatherei16.vv v8, v24, v20
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v20, v4, v16
+; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v20, v8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a3, a1, 5
-; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: li a3, 20
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t
-; RV32-NEXT: lui a1, 3073
-; RV32-NEXT: lui a3, %hi(.LCPI8_8)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI8_8)
-; RV32-NEXT: lui a4, %hi(.LCPI8_9)
-; RV32-NEXT: addi a4, a4, %lo(.LCPI8_9)
-; RV32-NEXT: addi a1, a1, -1024
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v8, v24, v0
+; RV32-NEXT: lui a1, %hi(.LCPI8_8)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_8)
+; RV32-NEXT: lui a3, %hi(.LCPI8_10)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_10)
+; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT: vle16.v v12, (a1)
+; RV32-NEXT: lui a1, %hi(.LCPI8_11)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_11)
; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT: vle16.v v16, (a3)
+; RV32-NEXT: vle16.v v16, (a1)
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vle16.v v2, (a4)
-; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vle16.v v14, (a3)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 41
+; RV32-NEXT: li a3, 52
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT: vcompress.vm v8, v24, v0
-; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vs2r.v v14, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 57
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: li a3, 36
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vrgatherei16.vv v0, v4, v12
+; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v0, v8
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v8, v24, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 4
-; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: li a2, 56
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 5
-; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: li a2, 88
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT: vrgatherei16.vv v12, v24, v2, v0.t
+; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 52
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v4, v24
+; RV32-NEXT: vl2r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v8, v16, v12
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 25
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vmv.v.v v20, v24
-; RV32-NEXT: vmv.v.v v12, v8
+; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v8, v24
; RV32-NEXT: addi a1, a0, 320
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v12, (a1)
+; RV32-NEXT: vse32.v v8, (a1)
; RV32-NEXT: addi a1, a0, 256
-; RV32-NEXT: vse32.v v20, (a1)
+; RV32-NEXT: vse32.v v0, (a1)
; RV32-NEXT: addi a1, a0, 192
-; RV32-NEXT: vse32.v v4, (a1)
+; RV32-NEXT: vse32.v v20, (a1)
; RV32-NEXT: addi a1, a0, 128
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a3, a2, 3
-; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: slli a2, a2, 6
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
; RV32-NEXT: addi a1, a0, 64
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 13
+; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 21
+; RV32-NEXT: li a2, 80
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vse32.v v8, (a0)
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 6
-; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: li a1, 96
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -594,479 +687,452 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 67
+; RV64-NEXT: li a3, 92
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc3, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 67 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xdc, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 92 * vlenb
; RV64-NEXT: addi a2, a1, 128
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vle64.v v8, (a1)
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: li a4, 59
-; RV64-NEXT: mul a3, a3, a4
-; RV64-NEXT: add a3, sp, a3
-; RV64-NEXT: addi a3, a3, 16
-; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV64-NEXT: addi a1, a1, 256
+; RV64-NEXT: addi a5, a1, 256
; RV64-NEXT: li a3, 128
-; RV64-NEXT: vle64.v v24, (a1)
-; RV64-NEXT: lui a1, 1
-; RV64-NEXT: vid.v v8
+; RV64-NEXT: lui a4, 4
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vle64.v v0, (a5)
+; RV64-NEXT: lui a5, 16
+; RV64-NEXT: addi a5, a5, 7
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a5
+; RV64-NEXT: lui a5, %hi(.LCPI8_0)
+; RV64-NEXT: addi a5, a5, %lo(.LCPI8_0)
+; RV64-NEXT: vmv.v.i v9, 6
+; RV64-NEXT: addi a4, a4, 260
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vrgather.vi v12, v0, 4
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 88
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 16
+; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
+; RV64-NEXT: vrgather.vi v12, v0, 5
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 72
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 16
+; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
+; RV64-NEXT: vrgatherei16.vv v12, v0, v9
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 56
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 16
+; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
+; RV64-NEXT: vrgatherei16.vv v12, v0, v8
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 76
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 16
+; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
+; RV64-NEXT: vrgather.vi v8, v0, 2
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 68
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 16
+; RV64-NEXT: vs4r.v v8, (a6) # Unknown-size Folded Spill
+; RV64-NEXT: vrgather.vi v8, v0, 3
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: slli a6, a6, 6
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 16
+; RV64-NEXT: vs4r.v v8, (a6) # Unknown-size Folded Spill
+; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v0, 8
; RV64-NEXT: vmv.s.x v0, a3
; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: li a4, 30
-; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: slli a3, a3, 3
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 16
; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
-; RV64-NEXT: li a3, 6
-; RV64-NEXT: vmul.vx v6, v8, a3
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v16, (a2)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vle16.v v12, (a5)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs2r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv.s.x v6, a4
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 88
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT: vrgather.vi v12, v8, 2, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 80
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 88
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v6
+; RV64-NEXT: vmv8r.v v8, v24
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vmerge.vvm v24, v24, v16, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vrgatherei16.vv v0, v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: lui a2, %hi(.LCPI8_1)
+; RV64-NEXT: addi a2, a2, %lo(.LCPI8_1)
+; RV64-NEXT: addi a1, a1, 520
+; RV64-NEXT: vle16.v v4, (a2)
+; RV64-NEXT: vmv.s.x v6, a1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl1r.v v3, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 72
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 80
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT: vrgather.vi v24, v16, 3, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 72
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v6
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 48
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vmerge.vvm v24, v8, v16, v0
+; RV64-NEXT: vmv8r.v v16, v8
+; RV64-NEXT: vrgatherei16.vv v8, v24, v4
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: li a1, 1040
+; RV64-NEXT: lui a2, 1
+; RV64-NEXT: addi a3, a2, -2016
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: vmv.s.x v2, a3
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 48
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a3, 56
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vrgather.vi v8, v24, 4
-; RV64-NEXT: csrr a4, vlenb
-; RV64-NEXT: li a5, 22
-; RV64-NEXT: mul a4, a4, a5
-; RV64-NEXT: add a4, sp, a4
-; RV64-NEXT: addi a4, a4, 16
-; RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v24, 8
-; RV64-NEXT: csrr a4, vlenb
-; RV64-NEXT: li a5, 39
-; RV64-NEXT: mul a4, a4, a5
-; RV64-NEXT: add a4, sp, a4
-; RV64-NEXT: addi a4, a4, 16
-; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 80
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v8, v16, 2, v0.t
-; RV64-NEXT: vmv.v.v v20, v8
-; RV64-NEXT: vmv.s.x v8, a3
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: li a4, 55
-; RV64-NEXT: mul a3, a3, a4
-; RV64-NEXT: add a3, sp, a3
-; RV64-NEXT: addi a3, a3, 16
-; RV64-NEXT: vs1r.v v8, (a3) # Unknown-size Folded Spill
-; RV64-NEXT: addi a3, a1, 65
+; RV64-NEXT: vrgather.vi v4, v24, 4, v0.t
+; RV64-NEXT: vmv1r.v v0, v2
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v8, (a2)
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a4, 47
-; RV64-NEXT: mul a2, a2, a4
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vmv.s.x v16, a3
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 35
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs1r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v16, v6, -16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 5
-; RV64-NEXT: sub a2, a3, a2
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vmv2r.v v18, v6
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 12
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 59
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 35
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT: vcompress.vm v24, v0, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 5
-; RV64-NEXT: sub a2, a3, a2
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v24, v8, v16, v0.t
-; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v20, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 18
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 22
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v8, v24, 5
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 30
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 39
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vrgather.vi v8, v24, 3, v0.t
-; RV64-NEXT: vmv.v.v v20, v8
-; RV64-NEXT: lui a2, 2
-; RV64-NEXT: addi a2, a2, 130
-; RV64-NEXT: vmv.s.x v8, a2
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v16, v18, -15
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 59
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT: vcompress.vm v24, v0, v8
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 47
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v24, v8, v16, v0.t
-; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v20, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 14
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: lui a2, 16
-; RV64-NEXT: addi a2, a2, 7
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.i v8, 6
-; RV64-NEXT: vmv.v.x v9, a2
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 22
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vrgatherei16.vv v12, v16, v8
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vrgatherei16.vv v12, v16, v9
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 5
-; RV64-NEXT: sub a2, a3, a2
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vmv4r.v v8, v16
-; RV64-NEXT: vrgather.vi v12, v16, 2
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 35
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vrgather.vi v12, v16, 3
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: lui a2, 4
+; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
+; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a3, 24
-; RV64-NEXT: addi a2, a2, 260
-; RV64-NEXT: vmv.s.x v0, a3
-; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
-; RV64-NEXT: vmv.s.x v24, a2
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 12
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl2r.v v2, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v6, v2, -14
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 59
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT: vcompress.vm v8, v16, v24
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 47
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v8, v16, v6, v0.t
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 22
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 30
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v1, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vmv1r.v v0, v1
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 39
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v28, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v28, v24, 4, v0.t
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v28, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: lui a2, 8
-; RV64-NEXT: addi a2, a2, 520
-; RV64-NEXT: vmv.s.x v7, a2
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v4, v2, -13
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 59
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT: vcompress.vm v8, v24, v7
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v8, v16, v4, v0.t
-; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v1
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 39
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 5
-; RV64-NEXT: sub a2, a3, a2
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 76
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 80
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv4r.v v8, v24
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v8, v16, 5, v0.t
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 5
-; RV64-NEXT: sub a2, a3, a2
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: lui a2, 96
+; RV64-NEXT: vrgather.vi v12, v24, 5, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 76
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 96
; RV64-NEXT: li a3, 192
-; RV64-NEXT: vmv.s.x v1, a3
+; RV64-NEXT: vmv.s.x v3, a3
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v8, a2
-; RV64-NEXT: vmv1r.v v0, v1
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 35
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.x v12, a1
+; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 68
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 35
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: li a2, 1040
-; RV64-NEXT: li a3, 28
-; RV64-NEXT: vmv.s.x v20, a2
-; RV64-NEXT: vmv.s.x v0, a3
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 30
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v22, v2, -12
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 59
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT: vcompress.vm v8, v24, v20
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 47
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v8, v24, v22, v0.t
-; RV64-NEXT: lui a2, 112
-; RV64-NEXT: addi a2, a2, 1
+; RV64-NEXT: vrgatherei16.vv v24, v8, v12, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 68
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 112
+; RV64-NEXT: addi a2, a2, 65
+; RV64-NEXT: addi a1, a1, 1
+; RV64-NEXT: vmv.s.x v0, a2
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v12, a2
-; RV64-NEXT: vmv1r.v v0, v1
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v4, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgatherei16.vv v4, v16, v12, v0.t
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 22
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v12, v16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: addi a1, a1, -2016
-; RV64-NEXT: vmv.s.x v12, a1
+; RV64-NEXT: vmv.v.x v2, a1
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 59
+; RV64-NEXT: li a2, 48
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vcompress.vm v16, v24, v12
-; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v12, v2, -11
+; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 30
+; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 80
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT: vrgatherei16.vv v24, v16, v2, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 47
+; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, %hi(.LCPI8_2)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2)
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vle16.v v2, (a1)
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: addi a1, a1, 130
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vrgatherei16.vv v24, v16, v2
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 56
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
+; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmerge.vvm v8, v16, v8, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 80
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, %hi(.LCPI8_3)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3)
+; RV64-NEXT: vle16.v v12, (a1)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 88
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; RV64-NEXT: vmv.v.v v8, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 88
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 5
-; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: li a2, 72
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.v v0, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; RV64-NEXT: vmv.v.v v4, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vrgatherei16.vv v24, v16, v12
+; RV64-NEXT: lui a1, %hi(.LCPI8_4)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4)
+; RV64-NEXT: vle16.v v8, (a1)
+; RV64-NEXT: lui a1, %hi(.LCPI8_5)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5)
+; RV64-NEXT: vle16.v v6, (a1)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 76
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
; RV64-NEXT: vmv.v.v v12, v24
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 35
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vrgatherei16.vv v24, v16, v8
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 68
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv.v.v v20, v8
-; RV64-NEXT: vmv4r.v v8, v4
-; RV64-NEXT: vmv.v.v v8, v16
-; RV64-NEXT: addi a1, a0, 256
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v20, (a1)
-; RV64-NEXT: addi a1, a0, 320
-; RV64-NEXT: vse64.v v8, (a1)
-; RV64-NEXT: addi a1, a0, 192
-; RV64-NEXT: vse64.v v12, (a1)
+; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; RV64-NEXT: vmv.v.v v8, v24
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 80
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vrgatherei16.vv v24, v16, v6
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; RV64-NEXT: vmv.v.v v16, v24
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 55
+; RV64-NEXT: li a3, 56
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v20, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vse64.v v20, (a1)
+; RV64-NEXT: addi a1, a0, 320
+; RV64-NEXT: vse64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 256
; RV64-NEXT: vse64.v v8, (a1)
+; RV64-NEXT: addi a1, a0, 192
+; RV64-NEXT: vse64.v v12, (a1)
; RV64-NEXT: addi a1, a0, 64
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 14
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vse64.v v8, (a1)
+; RV64-NEXT: vse64.v v0, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 18
+; RV64-NEXT: li a2, 88
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a1, 67
+; RV64-NEXT: li a1, 92
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index 188ef8fe35a4a1..8de1ed908f0227 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -11,21 +11,18 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v9, -8
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: li a0, 3
-; CHECK-NEXT: vmadd.vx v10, a0, v9
-; CHECK-NEXT: li a0, 73
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: li a0, 56
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: li a0, 146
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vcompress.vm v11, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 8
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t
-; CHECK-NEXT: vse8.v v11, (a1)
+; CHECK-NEXT: vslidedown.vi v10, v8, 8
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vrgather.vv v10, v8, v9
+; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
entry:
%0 = load <16 x i8>, ptr %in, align 1
@@ -39,21 +36,18 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a0, 146
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: li a0, 24
+; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vcompress.vm v10, v8, v9
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 8
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vsrl.vi v9, v8, 8
-; CHECK-NEXT: vsll.vi v8, v8, 8
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: li a0, 36
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vor.vv v8, v8, v9
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vrgather.vv v10, v8, v9
+; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
entry:
%0 = load <16 x i8>, ptr %in, align 1
@@ -105,20 +99,19 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v9, -8
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: li a0, 5
-; CHECK-NEXT: vmadd.vx v10, a0, v9
-; CHECK-NEXT: li a0, 33
-; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vcompress.vm v11, v8, v9
+; CHECK-NEXT: li a0, 132
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: lui a0, 28704
+; CHECK-NEXT: addi a0, a0, 1280
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 8
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t
-; CHECK-NEXT: vse8.v v11, (a1)
+; CHECK-NEXT: vslidedown.vi v9, v8, 8
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vrgather.vv v10, v8, v9
+; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
entry:
%0 = load <16 x i8>, ptr %in, align 1
>From 99e1f12dee0fe4284b8bcc3863007fd0d4f69729 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Dec 2024 11:26:44 +0800
Subject: [PATCH 3/5] Swap operand order to match generic case
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 +-
.../RISCV/rvv/fixed-vectors-fp-shuffles.ll | 14 +-
.../RISCV/rvv/fixed-vectors-int-shuffles.ll | 26 +-
.../rvv/fixed-vectors-interleaved-access.ll | 546 +++++++++---------
.../rvv/fixed-vectors-shuffle-deinterleave.ll | 12 +-
5 files changed, 297 insertions(+), 305 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 48b3b6e7ca20d6..25ec751992ac34 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5239,12 +5239,12 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
if (Lane == -1)
SelectMaskVals.push_back(DAG.getUNDEF(XLenVT));
else
- SelectMaskVals.push_back(DAG.getConstant(Lane, DL, XLenVT));
+ SelectMaskVals.push_back(DAG.getConstant(Lane ? 0 : 1, DL, XLenVT));
}
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, SelectMaskVals);
SDValue Select = DAG.getNode(ISD::VSELECT, DL, VT, SelectMask,
- SVN->getOperand(1), SVN->getOperand(0));
+ SVN->getOperand(0), SVN->getOperand(1));
// Move all indices relative to the first source.
SmallVector<int> NewMask(Mask.size());
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 4cb53d2a9cc83d..6e40bf64e0d782 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -29,10 +29,10 @@ define <4 x half> @shuffle_v4f16(<4 x half> %x, <4 x half> %y) {
define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: shuffle_v8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 19
+; CHECK-NEXT: li a0, -20
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: ret
%s = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
ret <8 x float> %s
@@ -403,10 +403,10 @@ define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
-; CHECK-NEXT: lui a0, 5
-; CHECK-NEXT: addi a0, a0, 1365
+; CHECK-NEXT: lui a0, 11
+; CHECK-NEXT: addi a0, a0, -1366
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v18, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
@@ -423,9 +423,9 @@ define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x
; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
-; CHECK-NEXT: li a0, -304
+; CHECK-NEXT: li a0, 271
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v18, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index df06acc1e6598c..c759152a730a12 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -16,10 +16,10 @@ define <4 x i16> @shuffle_v4i16(<4 x i16> %x, <4 x i16> %y) {
define <8 x i32> @shuffle_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: shuffle_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 52
+; CHECK-NEXT: li a0, 203
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: ret
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
ret <8 x i32> %s
@@ -455,9 +455,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v10, (a0)
-; CHECK-NEXT: li a0, 65
+; CHECK-NEXT: li a0, 20
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
+; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vrgather.vv v8, v9, v10
; CHECK-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 14, i32 8, i32 2>
@@ -688,9 +688,9 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v10, (a0)
-; CHECK-NEXT: li a0, 171
+; CHECK-NEXT: li a0, 84
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
+; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vrgather.vv v8, v9, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
@@ -702,9 +702,9 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: shuffle_v8i32_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vmv.v.i v0, -13
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: ret
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %s
@@ -1074,10 +1074,10 @@ define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
-; CHECK-NEXT: lui a0, 5
-; CHECK-NEXT: addi a0, a0, 1365
+; CHECK-NEXT: lui a0, 11
+; CHECK-NEXT: addi a0, a0, -1366
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v18, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
@@ -1094,9 +1094,9 @@ define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32>
; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
-; CHECK-NEXT: li a0, -304
+; CHECK-NEXT: li a0, 271
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
+; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v18, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index d4ae952325d6b3..67d649902b022a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -190,8 +190,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a3, a1, 128
; RV32-NEXT: addi a4, a1, 256
; RV32-NEXT: li a2, 32
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: lui a6, 196656
+; RV32-NEXT: li a5, 768
+; RV32-NEXT: lui a6, 12291
; RV32-NEXT: lui a7, %hi(.LCPI8_1)
; RV32-NEXT: addi a7, a7, %lo(.LCPI8_1)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -203,22 +203,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vmv.s.x v0, a5
-; RV32-NEXT: vle32.v v24, (a3)
-; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: vle32.v v24, (a1)
+; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a4, 72
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vle32.v v16, (a1)
+; RV32-NEXT: mul a1, a1, a4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vle32.v v16, (a3)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 6
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, a6, 48
+; RV32-NEXT: addi a6, a6, 3
; RV32-NEXT: vle16.v v4, (a7)
-; RV32-NEXT: vmv.s.x v3, a1
+; RV32-NEXT: vmv.s.x v3, a6
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 16
; RV32-NEXT: csrr a1, vlenb
@@ -228,7 +228,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: vmerge.vvm v8, v16, v8, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 52
; RV32-NEXT: mul a1, a1, a3
@@ -250,11 +250,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: li a1, 192
-; RV32-NEXT: lui a3, 786624
+; RV32-NEXT: li a1, 3
+; RV32-NEXT: lui a3, 49164
; RV32-NEXT: lui a4, %hi(.LCPI8_3)
; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3)
-; RV32-NEXT: addi a3, a3, 192
+; RV32-NEXT: slli a1, a1, 10
+; RV32-NEXT: addi a3, a3, 12
; RV32-NEXT: vmv.s.x v0, a1
; RV32-NEXT: vle16.v v16, (a4)
; RV32-NEXT: vmv.s.x v20, a3
@@ -271,7 +272,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 48
; RV32-NEXT: mul a1, a1, a3
@@ -299,28 +300,27 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: li a1, 768
-; RV32-NEXT: lui a3, 768
-; RV32-NEXT: li a4, 3
-; RV32-NEXT: lui a5, 3073
-; RV32-NEXT: lui a6, 3
-; RV32-NEXT: lui a7, 12291
-; RV32-NEXT: lui t0, 12
-; RV32-NEXT: addi a3, a3, 768
-; RV32-NEXT: slli a4, a4, 10
-; RV32-NEXT: addi a5, a5, -1024
-; RV32-NEXT: addi a6, a6, 3
-; RV32-NEXT: addi a7, a7, 3
-; RV32-NEXT: addi t0, t0, 12
+; RV32-NEXT: lui a1, 3
+; RV32-NEXT: lui a3, 196656
+; RV32-NEXT: lui a4, 12
+; RV32-NEXT: lui a5, 786624
+; RV32-NEXT: li a6, 48
+; RV32-NEXT: lui a7, 768
+; RV32-NEXT: li t0, 192
+; RV32-NEXT: addi a1, a1, 3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: addi a4, a4, 12
+; RV32-NEXT: addi a5, a5, 192
+; RV32-NEXT: addi a7, a7, 768
+; RV32-NEXT: vmv.s.x v1, a6
+; RV32-NEXT: vmv.s.x v8, t0
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs1r.v v8, (a6) # Unknown-size Folded Spill
; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: vmv.s.x v12, a3
+; RV32-NEXT: vmv.s.x v14, a3
; RV32-NEXT: vmv.s.x v7, a4
-; RV32-NEXT: vmv.s.x v1, a5
-; RV32-NEXT: vmv.s.x v3, a6
+; RV32-NEXT: vmv.s.x v3, a5
; RV32-NEXT: vmv.s.x v2, a7
-; RV32-NEXT: vmv.s.x v8, t0
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 80
; RV32-NEXT: mul a1, a1, a3
@@ -335,13 +335,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v12
+; RV32-NEXT: vmv1r.v v0, v14
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a1, a1, a3
@@ -374,8 +374,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v4, v16, v24, v0
-; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vmerge.vvm v4, v24, v16, v0
+; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 72
; RV32-NEXT: mul a1, a1, a3
@@ -390,7 +390,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v3
+; RV32-NEXT: vmv1r.v v0, v1
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 88
; RV32-NEXT: mul a1, a1, a3
@@ -404,7 +404,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v16, v16, v24, v0
+; RV32-NEXT: vmerge.vvm v16, v24, v16, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 36
; RV32-NEXT: mul a1, a1, a3
@@ -441,7 +441,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: vmerge.vvm v8, v16, v8, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 88
; RV32-NEXT: mul a1, a1, a3
@@ -452,13 +452,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0)
; RV32-NEXT: lui a3, %hi(.LCPI8_5)
; RV32-NEXT: addi a3, a3, %lo(.LCPI8_5)
-; RV32-NEXT: lui a4, 49164
+; RV32-NEXT: lui a4, 3073
; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; RV32-NEXT: vle16.v v24, (a3)
-; RV32-NEXT: addi a4, a4, 12
+; RV32-NEXT: addi a3, a4, -1024
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT: vle16.v v2, (a1)
-; RV32-NEXT: vmv.s.x v0, a4
+; RV32-NEXT: vmv.s.x v0, a3
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
@@ -687,226 +687,233 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 92
+; RV64-NEXT: li a3, 88
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xdc, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 92 * vlenb
-; RV64-NEXT: addi a2, a1, 128
-; RV64-NEXT: addi a5, a1, 256
-; RV64-NEXT: li a3, 128
-; RV64-NEXT: lui a4, 4
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v0, (a5)
-; RV64-NEXT: lui a5, 16
-; RV64-NEXT: addi a5, a5, 7
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v8, a5
+; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb
+; RV64-NEXT: addi a3, a1, 128
+; RV64-NEXT: addi a6, a1, 256
+; RV64-NEXT: li a4, 128
+; RV64-NEXT: lui a2, 1
; RV64-NEXT: lui a5, %hi(.LCPI8_0)
; RV64-NEXT: addi a5, a5, %lo(.LCPI8_0)
-; RV64-NEXT: vmv.v.i v9, 6
-; RV64-NEXT: addi a4, a4, 260
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.i v16, 6
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vle64.v v8, (a6)
+; RV64-NEXT: lui a6, 16
+; RV64-NEXT: addi a6, a6, 7
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.x v17, a6
+; RV64-NEXT: addi a6, a2, 65
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vrgather.vi v12, v0, 4
-; RV64-NEXT: csrr a6, vlenb
-; RV64-NEXT: li a7, 88
-; RV64-NEXT: mul a6, a6, a7
-; RV64-NEXT: add a6, sp, a6
-; RV64-NEXT: addi a6, a6, 16
-; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
-; RV64-NEXT: vrgather.vi v12, v0, 5
-; RV64-NEXT: csrr a6, vlenb
-; RV64-NEXT: li a7, 72
-; RV64-NEXT: mul a6, a6, a7
-; RV64-NEXT: add a6, sp, a6
-; RV64-NEXT: addi a6, a6, 16
-; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
-; RV64-NEXT: vrgatherei16.vv v12, v0, v9
-; RV64-NEXT: csrr a6, vlenb
-; RV64-NEXT: li a7, 56
-; RV64-NEXT: mul a6, a6, a7
-; RV64-NEXT: add a6, sp, a6
-; RV64-NEXT: addi a6, a6, 16
-; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
-; RV64-NEXT: vrgatherei16.vv v12, v0, v8
-; RV64-NEXT: csrr a6, vlenb
-; RV64-NEXT: li a7, 76
-; RV64-NEXT: mul a6, a6, a7
-; RV64-NEXT: add a6, sp, a6
-; RV64-NEXT: addi a6, a6, 16
-; RV64-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill
-; RV64-NEXT: vrgather.vi v8, v0, 2
-; RV64-NEXT: csrr a6, vlenb
-; RV64-NEXT: li a7, 68
-; RV64-NEXT: mul a6, a6, a7
-; RV64-NEXT: add a6, sp, a6
-; RV64-NEXT: addi a6, a6, 16
-; RV64-NEXT: vs4r.v v8, (a6) # Unknown-size Folded Spill
-; RV64-NEXT: vrgather.vi v8, v0, 3
-; RV64-NEXT: csrr a6, vlenb
-; RV64-NEXT: slli a6, a6, 6
-; RV64-NEXT: add a6, sp, a6
-; RV64-NEXT: addi a6, a6, 16
-; RV64-NEXT: vs4r.v v8, (a6) # Unknown-size Folded Spill
+; RV64-NEXT: vrgather.vi v24, v8, 4
+; RV64-NEXT: vrgather.vi v20, v8, 5
+; RV64-NEXT: csrr a7, vlenb
+; RV64-NEXT: li t0, 68
+; RV64-NEXT: mul a7, a7, t0
+; RV64-NEXT: add a7, sp, a7
+; RV64-NEXT: addi a7, a7, 16
+; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: vrgatherei16.vv v20, v8, v16
+; RV64-NEXT: csrr a7, vlenb
+; RV64-NEXT: li t0, 84
+; RV64-NEXT: mul a7, a7, t0
+; RV64-NEXT: add a7, sp, a7
+; RV64-NEXT: addi a7, a7, 16
+; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: vrgatherei16.vv v20, v8, v17
+; RV64-NEXT: csrr a7, vlenb
+; RV64-NEXT: li t0, 72
+; RV64-NEXT: mul a7, a7, t0
+; RV64-NEXT: add a7, sp, a7
+; RV64-NEXT: addi a7, a7, 16
+; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: vrgather.vi v16, v8, 2
+; RV64-NEXT: csrr a7, vlenb
+; RV64-NEXT: slli a7, a7, 6
+; RV64-NEXT: add a7, sp, a7
+; RV64-NEXT: addi a7, a7, 16
+; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: vrgather.vi v16, v8, 3
+; RV64-NEXT: csrr a7, vlenb
+; RV64-NEXT: li t0, 56
+; RV64-NEXT: mul a7, a7, t0
+; RV64-NEXT: add a7, sp, a7
+; RV64-NEXT: addi a7, a7, 16
+; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v0, 8
-; RV64-NEXT: vmv.s.x v0, a3
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a3, a3, 3
-; RV64-NEXT: add a3, sp, a3
-; RV64-NEXT: addi a3, a3, 16
-; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vslidedown.vi v16, v8, 8
+; RV64-NEXT: csrr a7, vlenb
+; RV64-NEXT: li t0, 48
+; RV64-NEXT: mul a7, a7, t0
+; RV64-NEXT: add a7, sp, a7
+; RV64-NEXT: addi a7, a7, 16
+; RV64-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: vmv.s.x v21, a4
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v24, (a1)
-; RV64-NEXT: vle64.v v16, (a2)
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 48
-; RV64-NEXT: mul a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vle16.v v12, (a5)
+; RV64-NEXT: vle64.v v8, (a1)
+; RV64-NEXT: vle64.v v0, (a3)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 5
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs2r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv.s.x v6, a4
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 88
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vle16.v v2, (a5)
+; RV64-NEXT: vmv.s.x v20, a6
+; RV64-NEXT: vmv1r.v v0, v21
+; RV64-NEXT: vmv1r.v v7, v21
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v12, v8, 2, v0.t
+; RV64-NEXT: vrgather.vi v24, v16, 2, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 80
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 60
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v20
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 88
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v6
-; RV64-NEXT: vmv8r.v v8, v24
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vmerge.vvm v24, v24, v16, v0
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 5
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v0, v24, v16
+; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
+; RV64-NEXT: vmv8r.v v16, v8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 40
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 76
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: lui a1, 8
-; RV64-NEXT: lui a2, %hi(.LCPI8_1)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI8_1)
-; RV64-NEXT: addi a1, a1, 520
-; RV64-NEXT: vle16.v v4, (a2)
-; RV64-NEXT: vmv.s.x v6, a1
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vrgatherei16.vv v8, v24, v2
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: slli a1, a1, 5
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl1r.v v3, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: lui a3, %hi(.LCPI8_1)
+; RV64-NEXT: addi a3, a3, %lo(.LCPI8_1)
+; RV64-NEXT: addi a1, a1, 130
+; RV64-NEXT: vle16.v v8, (a3)
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vmv.s.x v2, a1
+; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs1r.v v7, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 72
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 68
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 80
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 48
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v24, v16, 3, v0.t
+; RV64-NEXT: vrgather.vi v24, v8, 3, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 72
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 68
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v6
+; RV64-NEXT: vmv1r.v v0, v2
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 48
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vmerge.vvm v24, v8, v16, v0
-; RV64-NEXT: vmv8r.v v16, v8
-; RV64-NEXT: vrgatherei16.vv v8, v24, v4
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: li a1, 1040
-; RV64-NEXT: lui a2, 1
-; RV64-NEXT: addi a3, a2, -2016
+; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vrgatherei16.vv v0, v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: lui a3, 8
+; RV64-NEXT: addi a1, a1, 260
+; RV64-NEXT: addi a3, a3, 520
; RV64-NEXT: vmv.s.x v0, a1
; RV64-NEXT: vmv.s.x v2, a3
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 48
+; RV64-NEXT: li a3, 76
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmerge.vvm v24, v8, v16, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl1r.v v7, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv1r.v v0, v7
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 56
+; RV64-NEXT: li a3, 84
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 80
+; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v4, v24, 4, v0.t
+; RV64-NEXT: vrgather.vi v24, v16, 4, v0.t
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 84
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vmv1r.v v0, v2
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 24
+; RV64-NEXT: li a3, 76
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vmerge.vvm v24, v8, v16, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: vmv8r.v v16, v8
+; RV64-NEXT: vmv1r.v v0, v7
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 76
+; RV64-NEXT: li a3, 72
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 80
+; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
@@ -915,7 +922,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV64-NEXT: vrgather.vi v12, v24, 5, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 76
+; RV64-NEXT: li a3, 72
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
@@ -927,142 +934,130 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vmv.v.x v12, a1
; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 68
-; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: slli a1, a1, 6
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV64-NEXT: vrgatherei16.vv v24, v8, v12, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 68
-; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: slli a1, a1, 6
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: lui a1, 112
-; RV64-NEXT: addi a2, a2, 65
-; RV64-NEXT: addi a1, a1, 1
-; RV64-NEXT: vmv.s.x v0, a2
+; RV64-NEXT: lui a1, %hi(.LCPI8_2)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2)
+; RV64-NEXT: li a3, 1040
+; RV64-NEXT: lui a4, 112
+; RV64-NEXT: addi a4, a4, 1
+; RV64-NEXT: vmv.s.x v0, a3
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v2, a1
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 48
-; RV64-NEXT: mul a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.x v12, a4
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
+; RV64-NEXT: vle16.v v6, (a1)
+; RV64-NEXT: vmv8r.v v24, v16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: li a3, 76
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmerge.vvm v16, v24, v16, v0
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 6
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 80
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgatherei16.vv v24, v16, v2, v0.t
+; RV64-NEXT: vrgatherei16.vv v16, v8, v12, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: li a3, 56
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: lui a1, %hi(.LCPI8_2)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2)
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle16.v v2, (a1)
-; RV64-NEXT: lui a1, 2
-; RV64-NEXT: addi a1, a1, 130
+; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: addi a1, a2, -2016
; RV64-NEXT: vmv.s.x v0, a1
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v24, v16, v2
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vrgatherei16.vv v16, v8, v6
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 56
+; RV64-NEXT: li a2, 76
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmerge.vvm v8, v16, v8, v0
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmerge.vvm v8, v24, v8, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 80
+; RV64-NEXT: li a2, 76
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: lui a1, %hi(.LCPI8_3)
; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3)
-; RV64-NEXT: vle16.v v12, (a1)
+; RV64-NEXT: vle16.v v24, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 88
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: slli a1, a1, 5
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 40
+; RV64-NEXT: li a2, 60
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v8, v16
+; RV64-NEXT: vmv.v.v v8, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 88
+; RV64-NEXT: li a2, 60
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 72
+; RV64-NEXT: li a2, 68
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: li a2, 24
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv.v.v v0, v16
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.v v0, v8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 56
+; RV64-NEXT: li a2, 84
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v4, v16
+; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 56
+; RV64-NEXT: li a2, 84
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 24
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v24, v16, v12
+; RV64-NEXT: vrgatherei16.vv v16, v8, v24
; RV64-NEXT: lui a1, %hi(.LCPI8_4)
; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4)
; RV64-NEXT: vle16.v v8, (a1)
@@ -1070,30 +1065,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5)
; RV64-NEXT: vle16.v v6, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 76
+; RV64-NEXT: li a2, 72
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v12, v24
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vmv.v.v v12, v16
+; RV64-NEXT: addi a1, sp, 16
; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vrgatherei16.vv v24, v16, v8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 68
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: slli a1, a1, 6
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
; RV64-NEXT: vmv.v.v v8, v24
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 80
+; RV64-NEXT: li a2, 76
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
@@ -1101,38 +1092,39 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vrgatherei16.vv v24, v16, v6
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
; RV64-NEXT: vmv.v.v v16, v24
+; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vse64.v v8, (a1)
+; RV64-NEXT: addi a1, a0, 320
+; RV64-NEXT: vse64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 192
+; RV64-NEXT: vse64.v v12, (a1)
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 56
+; RV64-NEXT: li a3, 84
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl4r.v v20, (a2) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v20, (a1)
-; RV64-NEXT: addi a1, a0, 320
-; RV64-NEXT: vse64.v v16, (a1)
-; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a1)
-; RV64-NEXT: addi a1, a0, 192
-; RV64-NEXT: vse64.v v12, (a1)
; RV64-NEXT: addi a1, a0, 64
; RV64-NEXT: vse64.v v0, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 88
+; RV64-NEXT: li a2, 60
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a1, 92
+; RV64-NEXT: li a1, 88
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index 8de1ed908f0227..10dadbc022e02e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -15,12 +15,12 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) {
; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v9, (a0)
-; CHECK-NEXT: li a0, 146
+; CHECK-NEXT: li a0, 73
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: vrgather.vv v10, v8, v9
; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
@@ -40,12 +40,12 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) {
; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v9, (a0)
-; CHECK-NEXT: li a0, 36
+; CHECK-NEXT: li a0, 146
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: vrgather.vv v10, v8, v9
; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
@@ -99,14 +99,14 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a0, 132
+; CHECK-NEXT: li a0, 33
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: lui a0, 28704
; CHECK-NEXT: addi a0, a0, 1280
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v9, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
>From d074e75afe8db0d8796365b533a8346abdce5342 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Dec 2024 11:48:22 +0800
Subject: [PATCH 4/5] Bail on identity shuffles
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +++++----
.../CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll | 12 ++++--------
.../CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll | 12 ++++--------
3 files changed, 13 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 25ec751992ac34..096b9fa79173fe 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5602,12 +5602,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
}
// If the mask indices are disjoint between the two sources, we can lower it
- // as a vselect + a single source vrgather.vv. Don't do this if the operands
- // will be splatted since they will be lowered to something cheaper like
- // vrgather.vi anyway.
+ // as a vselect + a single source vrgather.vv. Don't do this if we think the
+ // operands may end up being lowered to something cheaper than a vrgather.vv.
if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) &&
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) &&
- !ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT))
+ !ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT) &&
+ !ShuffleVectorInst::isIdentityMask(ShuffleMaskLHS, NumElts) &&
+ !ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts))
if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
return V;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 6e40bf64e0d782..41d8abb9b73ebc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -421,15 +421,11 @@ define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI31_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vle8.v v16, (a0)
-; CHECK-NEXT: li a0, 271
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: li a0, -272
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vsext.vf2 v18, v16
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
ret <16 x float> %out
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index c759152a730a12..962bf919008d45 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1092,15 +1092,11 @@ define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32>
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI71_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vle8.v v16, (a0)
-; CHECK-NEXT: li a0, 271
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: li a0, -272
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vsext.vf2 v18, v16
-; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
+; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
ret <16 x i32> %out
>From 7a8309574e778a9db8f32557df37c83aff54b00c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 12 Dec 2024 13:39:19 +0800
Subject: [PATCH 5/5] Update tests after rebase
---
.../RISCV/rvv/fixed-vectors-int-shuffles.ll | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 962bf919008d45..10156141119a7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1070,8 +1070,8 @@ define <16 x i64> @shuffle_zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) {
define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI70_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0)
+; CHECK-NEXT: lui a0, %hi(.LCPI74_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
; CHECK-NEXT: lui a0, 11
@@ -1090,8 +1090,8 @@ define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI71_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0)
+; CHECK-NEXT: lui a0, %hi(.LCPI75_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI75_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: li a0, -272
@@ -1105,8 +1105,8 @@ define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32>
define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI72_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI72_0)
+; CHECK-NEXT: lui a0, %hi(.LCPI76_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI76_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v20, (a0)
; CHECK-NEXT: lui a0, 15
@@ -1123,8 +1123,8 @@ define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32
define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI73_0)
-; CHECK-NEXT: addi a1, a1, %lo(.LCPI73_0)
+; CHECK-NEXT: lui a1, %hi(.LCPI77_0)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI77_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a1)
; CHECK-NEXT: lui a1, 15
More information about the llvm-commits
mailing list