[llvm] [MachineScheduler][RISCV] Release the pending queue based on condition (PR #125468)
Piyou Chen via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 9 20:50:44 PST 2025
https://github.com/BeMg updated https://github.com/llvm/llvm-project/pull/125468
From b942cc2b655b741ee59d8608af4da416b05c4a43 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Tue, 21 Jan 2025 21:09:40 -0800
Subject: [PATCH 01/11] [MachineScheduler][RISCV] Release the pending queue
 based on condition
---
llvm/include/llvm/CodeGen/MachineScheduler.h | 9 +
.../include/llvm/CodeGen/TargetRegisterInfo.h | 12 +
llvm/lib/CodeGen/MachineScheduler.cpp | 45 +
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 33 +
llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 7 +
.../CodeGen/RISCV/rvv/bitreverse-sdnode.ll | 44 +-
llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll | 248 +++---
llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll | 44 +-
llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll | 302 ++++---
.../CodeGen/RISCV/rvv/calling-conv-fastcc.ll | 110 +--
llvm/test/CodeGen/RISCV/rvv/calling-conv.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/expandload.ll | 548 +++++-------
.../rvv/fixed-vectors-bitcast-large-vector.ll | 8 +-
.../RISCV/rvv/fixed-vectors-bitreverse-vp.ll | 228 ++---
.../RISCV/rvv/fixed-vectors-bswap-vp.ll | 204 ++---
.../rvv/fixed-vectors-calling-conv-fastcc.ll | 14 +-
.../RISCV/rvv/fixed-vectors-calling-conv.ll | 14 +-
.../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 840 ++++++------------
.../RISCV/rvv/fixed-vectors-ctpop-vp.ll | 372 ++++----
.../RISCV/rvv/fixed-vectors-cttz-vp.ll | 780 +++++++---------
.../RISCV/rvv/fixed-vectors-fmaximum-vp.ll | 44 +-
.../RISCV/rvv/fixed-vectors-fminimum-vp.ll | 44 +-
.../rvv/fixed-vectors-interleaved-access.ll | 693 ++++++++-------
.../RISCV/rvv/fixed-vectors-reduction-fp.ll | 320 ++++---
.../RISCV/rvv/fixed-vectors-reduction-int.ll | 224 ++---
.../RISCV/rvv/fixed-vectors-select-addsub.ll | 15 +-
.../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 42 +-
.../RISCV/rvv/fixed-vectors-setcc-int-vp.ll | 36 +-
.../RISCV/rvv/fixed-vectors-strided-vpload.ll | 16 +-
.../RISCV/rvv/fixed-vectors-trunc-vp.ll | 80 +-
.../RISCV/rvv/fixed-vectors-vcopysign-vp.ll | 42 +-
.../RISCV/rvv/fixed-vectors-vfma-vp.ll | 80 +-
.../RISCV/rvv/fixed-vectors-vfmax-vp.ll | 42 +-
.../RISCV/rvv/fixed-vectors-vfmin-vp.ll | 42 +-
.../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll | 80 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll | 2 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll | 2 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll | 2 +-
.../RISCV/rvv/fixed-vectors-vpmerge.ll | 10 +-
.../RISCV/rvv/fixed-vectors-vpscatter.ll | 10 +-
.../RISCV/rvv/fixed-vectors-vscale-range.ll | 104 +--
.../RISCV/rvv/fixed-vectors-vselect-vp.ll | 99 +--
.../CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll | 4 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll | 4 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll | 4 +-
.../RISCV/rvv/fixed-vectors-vwmulsu.ll | 4 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll | 4 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll | 4 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll | 229 +++--
llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll | 28 +-
.../test/CodeGen/RISCV/rvv/mscatter-sdnode.ll | 58 +-
.../test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 322 ++++---
llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll | 72 +-
.../RISCV/rvv/vector-deinterleave-load.ll | 20 +-
llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 161 +---
llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll | 145 +--
llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll | 35 +-
llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll | 14 +-
.../CodeGen/RISCV/rvv/vpscatter-sdnode.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 62 +-
llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll | 35 +-
65 files changed, 3259 insertions(+), 3965 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 660670ccdcd75b4..47809606ff40754 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1068,6 +1068,13 @@ class SchedBoundary {
/// Dump the state of the information that tracks resource usage.
void dumpReservedCycles() const;
void dumpScheduledState() const;
+
+ void bumpCycleUntilReleaseSUFromPending(SUnit *SU) {
+ while (!Pending.empty() && llvm::find(Pending, SU) != Pending.end()) {
+ bumpCycle(CurrCycle + 1);
+ releasePending();
+ }
+ }
};
/// Base class for GenericScheduler. This class maintains information about
@@ -1262,6 +1269,8 @@ class GenericScheduler : public GenericSchedulerBase {
BotCand.SU = nullptr;
}
+ void bumpCycleUntilReleaseSUFromPending(bool IsTop);
+
void registerRoots() override;
protected:
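
For context: a SchedBoundary keeps two queues per scheduling direction, Available (units that can be picked at the current modeled cycle) and Pending (units whose predecessors are scheduled but whose ready cycle has not been reached), and releasePending() moves units into Available once CurrCycle catches up with them. The new helper keeps bumping the cycle until a specific unit crosses that boundary. Below is a minimal standalone model of that behavior; the types and names are toy stand-ins, not the LLVM classes, and the real releasePending() also checks for hazards.

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy stand-in: a unit becomes available once the modeled cycle reaches its
// ready cycle, roughly what releasePending() does for SUnits.
struct ToyUnit { unsigned ReadyCycle; };

struct ToyBoundary {
  unsigned CurrCycle = 0;
  std::vector<ToyUnit *> Pending, Available;

  void releasePending() {
    for (auto It = Pending.begin(); It != Pending.end();) {
      if ((*It)->ReadyCycle <= CurrCycle) {
        Available.push_back(*It);
        It = Pending.erase(It);
      } else {
        ++It;
      }
    }
  }

  // Same shape as the new SchedBoundary helper: advance the cycle until the
  // requested unit has left Pending.
  void bumpCycleUntilReleased(ToyUnit *U) {
    while (std::find(Pending.begin(), Pending.end(), U) != Pending.end()) {
      ++CurrCycle; // stands in for bumpCycle(CurrCycle + 1)
      releasePending();
    }
  }
};

int main() {
  ToyUnit A{3}, B{7};
  ToyBoundary Zone;
  Zone.Pending = {&A, &B};
  Zone.bumpCycleUntilReleased(&B);
  std::printf("cycle=%u available=%zu\n", Zone.CurrCycle, Zone.Available.size());
  // Prints "cycle=7 available=2": both units were released along the way.
}

The real helper has the same effect: it trades extra modeled stall cycles for the ability to schedule a pending unit immediately.
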
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 114149ff53d850b..26107eb8e960b11 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1233,6 +1233,18 @@ class TargetRegisterInfo : public MCRegisterInfo {
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const {
return {};
}
+
+ virtual bool
+ needReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const {
+ return false;
+ }
+
+ virtual bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const {
+ return false;
+ }
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 393530f56cc27ee..586c8857bb199fc 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -161,6 +161,10 @@ static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
cl::desc("Enable memop clustering."),
cl::init(true));
+static cl::opt<bool>
+ EnableReleasePendingQ("misched-release-pending-queue", cl::Hidden,
+ cl::desc("Release the pending queue"),
+ cl::init(true));
static cl::opt<bool>
ForceFastCluster("force-fast-cluster", cl::Hidden,
cl::desc("Switch to fast cluster algorithm with the lost "
@@ -3656,6 +3660,37 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
}
}
+void GenericScheduler::bumpCycleUntilReleaseSUFromPending(bool IsTop) {
+ if (!DAG->isTrackingPressure())
+ return;
+ auto releasePending = [&](ReadyQueue &Q, const RegPressureTracker &RegP,
+ ArrayRef<unsigned> MaxSetP, SchedBoundary &SchedB) {
+ for (SUnit *SU : Q) {
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RegP);
+ CandPolicy TempPolicy;
+ SchedCandidate TryCand(TempPolicy);
+ initCandidate(TryCand, SU, IsTop, RegP, TempTracker);
+ PressureDiff PDiff = DAG->getPressureDiff(SU);
+ SmallVector<unsigned> PSetIDs;
+ SmallVector<int> UnitIncs;
+ for (const auto &PChange : PDiff) {
+ if (!PChange.isValid())
+ continue;
+ PSetIDs.push_back(PChange.getPSet());
+ UnitIncs.push_back(PChange.getUnitInc());
+ }
+ if (TRI->needReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
+ SchedB.bumpCycleUntilReleaseSUFromPending(SU);
+ }
+ };
+ if (IsTop)
+ releasePending(Top.Pending, DAG->getTopRPTracker(),
+ DAG->getTopRPTracker().getPressure().MaxSetPressure, Top);
+ else
+ releasePending(Bot.Pending, DAG->getBotRPTracker(),
+ DAG->getBotRPTracker().getPressure().MaxSetPressure, Bot);
+}
+
/// Pick the best candidate node from either the top or bottom queue.
SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
@@ -3741,6 +3776,16 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
+
+ if (EnableReleasePendingQ && !RegionPolicy.OnlyBottomUp &&
+ TRI->needReleasePendingQueue(
+ DAG->MF, DAG->getTopRPTracker().getPressure().MaxSetPressure))
+ bumpCycleUntilReleaseSUFromPending(/*IsTop=*/true);
+ if (EnableReleasePendingQ && !RegionPolicy.OnlyTopDown &&
+ TRI->needReleasePendingQueue(
+ DAG->MF, DAG->getBotRPTracker().getPressure().MaxSetPressure))
+ bumpCycleUntilReleaseSUFromPending(/*IsTop=*/false);
+
SUnit *SU;
do {
if (RegionPolicy.OnlyTopDown) {
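
In pickNode(), the new path is guarded by the -misched-release-pending-queue flag (default true), by DAG->isTrackingPressure(), and by the region's scheduling direction; when the target's needReleasePendingQueue() returns true for the tracked MaxSetPressure, every unit still in the pending queue has its valid pressure changes flattened into the parallel PSetID/UnitInc arrays and passed to the second hook. The sketch below models that per-unit scan with invented types and a stand-in predicate in place of the TargetRegisterInfo hook; it is an illustration, not LLVM API.

#include <utility>
#include <vector>

// Toy model of the per-boundary scan added to GenericScheduler: for each unit
// still in Pending, split its valid pressure changes into the two parallel
// arrays the target hook receives, and release the unit if the target asks.
struct ToyPendingUnit {
  // One (pressure-set id, unit increment) pair per valid PressureChange.
  std::vector<std::pair<unsigned, int>> PressureChanges;
  bool Released = false;
};

template <typename TargetPredicate>
void scanPending(std::vector<ToyPendingUnit> &Pending,
                 TargetPredicate NeedRelease) {
  for (ToyPendingUnit &U : Pending) {
    std::vector<unsigned> PSetIDs;
    std::vector<int> UnitIncs;
    for (auto [PSet, Inc] : U.PressureChanges) {
      PSetIDs.push_back(PSet);
      UnitIncs.push_back(Inc);
    }
    if (NeedRelease(PSetIDs, UnitIncs))
      U.Released = true; // stands in for bumpCycleUntilReleaseSUFromPending(SU)
  }
}

Because the guard is a cl::opt, the old and new schedules can be compared by passing -misched-release-pending-queue=false to llc when re-running the tests below.
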
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b0a52698c1e9f10..fca11f0d1365426 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -954,3 +954,36 @@ bool RISCVRegisterInfo::getRegAllocationHints(
return BaseImplRetVal;
}
+
+bool RISCVRegisterInfo::needReleasePendingQueue(
+ MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
+ for (unsigned Idx = 0; Idx < MaxSetPressure.size(); Idx++) {
+    // Consider only RVV registers, as RVV spilling/reloading has higher
+ // potential costs than hazards.
+ if (!StringRef(getRegPressureSetName(Idx)).starts_with("VM") &&
+ !StringRef(getRegPressureSetName(Idx)).starts_with("VRM8NoV0"))
+ continue;
+ const unsigned RVVRegPressureThreshold = 7;
+ if (MaxSetPressure[Idx] + RVVRegPressureThreshold >
+ getRegPressureSetLimit(MF, Idx)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool RISCVRegisterInfo::needReleaseSUFromPendingQueue(
+ MachineFunction &MF, ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const {
+ const int UnitIncRVVRegPressureThreshold = -3;
+ for (unsigned Idx = 0; Idx < PSetID.size(); Idx++) {
+    // Consider only RVV registers, as RVV spilling/reloading has higher
+ // potential costs than hazards.
+ if (!StringRef(getRegPressureSetName(PSetID[Idx])).starts_with("VM") &&
+        !StringRef(getRegPressureSetName(PSetID[Idx])).starts_with("VRM8NoV0"))
+ continue;
+ if (UnitInc[Idx] < UnitIncRVVRegPressureThreshold)
+ return true;
+ }
+ return false;
+}
\ No newline at end of file
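
The RISC-V overrides only look at pressure sets whose names start with "VM" or "VRM8NoV0" and use two fixed thresholds: the pending queue is scanned once a tracked max pressure comes within 7 units of the set limit, and an individual pending unit is released when its pressure diff on such a set is below -3. The standalone sketch below shows only the threshold arithmetic; the name filtering is omitted, the set limit is an invented number, and in the real code it comes from getRegPressureSetLimit().

#include <cstdio>
#include <vector>

// Same constants as the RISC-V overrides above; the pressure values and the
// set limit used in main() are made up for illustration.
constexpr unsigned RVVRegPressureThreshold = 7;
constexpr int UnitIncRVVRegPressureThreshold = -3;

// Mirrors needReleasePendingQueue(): fire once max pressure of a tracked set
// gets within the threshold of its limit.
bool pressureNearLimit(const std::vector<unsigned> &MaxSetPressure,
                       unsigned SetLimit) {
  for (unsigned P : MaxSetPressure)
    if (P + RVVRegPressureThreshold > SetLimit)
      return true;
  return false;
}

// Mirrors needReleaseSUFromPendingQueue(): release a pending unit whose
// pressure diff on a tracked set is below the -3 threshold.
bool unitWorthReleasing(const std::vector<int> &UnitInc) {
  for (int Inc : UnitInc)
    if (Inc < UnitIncRVVRegPressureThreshold)
      return true;
  return false;
}

int main() {
  // With a hypothetical set limit of 32 units, pressure 26 triggers the scan
  // (26 + 7 > 32) while 25 does not (25 + 7 == 32).
  std::printf("%d %d\n", pressureNearLimit({25}, 32), pressureNearLimit({26}, 32));
  // A pending unit with a diff of -4 on such a set is released; -3 is not.
  std::printf("%d %d\n", unitWorthReleasing({-3}), unitWorthReleasing({-4}));
}
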
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694e175c8a..faf81b2d8b73d65 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -144,6 +144,13 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
static bool isRVVRegClass(const TargetRegisterClass *RC) {
return RISCVRI::isVRegClass(RC->TSFlags);
}
+ bool
+ needReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const override;
+
+ bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const override;
};
} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index 1ed84316d4484cd..cd795f722676bd9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -1137,38 +1137,38 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-NEXT: li a1, 56
; RV32-NEXT: li a2, 40
; RV32-NEXT: lui a3, 16
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: addi a5, sp, 8
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vsrl.vx v24, v8, a2
-; RV32-NEXT: addi a0, a3, -256
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsll.vx v0, v8, a1
-; RV32-NEXT: vand.vx v24, v24, a0
+; RV32-NEXT: vand.vx v24, v24, a3
; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a3
; RV32-NEXT: vsll.vx v16, v16, a2
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v0, (a5), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a4
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: lui a1, 4080
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a1
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24
-; RV32-NEXT: vand.vv v16, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: vand.vx v8, v8, a1
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 4d34621cd5f243c..8ae560c07e21016 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -2288,66 +2288,68 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v24, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vx v16, v24, a2, v0.t
+; RV32-NEXT: vsrl.vx v8, v24, a4, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT: vand.vx v16, v8, a5, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 8, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -2497,40 +2499,40 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2
-; RV32-NEXT: addi a1, a3, -256
; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a1
+; RV32-NEXT: vand.vx v0, v0, a3
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v24, v8, a1
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v24, v8, a3
; RV32-NEXT: vsll.vx v24, v24, a4
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v24, (a6), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v24, a5
+; RV32-NEXT: vsrl.vi v24, v8, 8
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v0, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v24, v8, v24
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v24, v8, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v0, v8
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: lui a3, 349525
@@ -2673,66 +2675,68 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v24, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a4, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: slli a5, a5, 4
-; RV32-NEXT: add a5, sp, a5
-; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vx v16, v24, a2, v0.t
+; RV32-NEXT: vsrl.vx v8, v24, a4, v0.t
+; RV32-NEXT: vand.vx v8, v8, a1, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT: vand.vx v16, v8, a5, v0.t
+; RV32-NEXT: vsrl.vi v8, v24, 8, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -2882,40 +2886,40 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v16, v8, a2
-; RV32-NEXT: addi a1, a3, -256
; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a1
+; RV32-NEXT: vand.vx v0, v0, a3
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v24, v8, a1
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v24, v8, a3
; RV32-NEXT: vsll.vx v24, v24, a4
; RV32-NEXT: vor.vv v16, v16, v24
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v24, (a6), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
-; RV32-NEXT: vsrl.vi v0, v8, 8
-; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v24, a5
+; RV32-NEXT: vsrl.vi v24, v8, 8
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v0, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v24, v8, v24
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v24, v8, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v0, v8
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: lui a3, 349525
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
index 2cd763afa36b73f..2de70a8cf1ce0c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
@@ -518,38 +518,38 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-NEXT: li a1, 56
; RV32-NEXT: li a2, 40
; RV32-NEXT: lui a3, 16
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: addi a5, sp, 8
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: vsrl.vx v24, v8, a2
-; RV32-NEXT: addi a0, a3, -256
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsll.vx v0, v8, a1
-; RV32-NEXT: vand.vx v24, v24, a0
+; RV32-NEXT: vand.vx v24, v24, a3
; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vand.vx v16, v8, a3
; RV32-NEXT: vsll.vx v16, v16, a2
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v0, (a5), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a4
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: lui a1, 4080
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a1
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24
-; RV32-NEXT: vand.vv v16, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: vand.vx v8, v8, a1
; RV32-NEXT: vsll.vi v8, v8, 24
; RV32-NEXT: vsll.vi v16, v16, 8
; RV32-NEXT: vor.vv v8, v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0c58cca0f94726b..09df35600bce832 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1022,59 +1022,61 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: lui a3, 16
-; RV32-NEXT: li a4, 40
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v16, v8
+; RV32-NEXT: lui a0, 1044480
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: lui a4, 4080
; RV32-NEXT: addi a5, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT: addi a0, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a0, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi a0, a2, -256
+; RV32-NEXT: vand.vx v24, v16, a0, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v8, v16, a4, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a5), zero
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 8, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v8, v16, a1, v0.t
+; RV32-NEXT: vsrl.vx v24, v16, a3, v0.t
; RV32-NEXT: vand.vx v24, v24, a0, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
@@ -1174,45 +1176,46 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v24, v8, a2
-; RV32-NEXT: addi a0, a3, -256
; RV32-NEXT: vsrl.vx v16, v8, a2
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: addi a2, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a0
+; RV32-NEXT: vand.vx v0, v0, a2
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v8, a2
; RV32-NEXT: vsll.vx v0, v0, a4
; RV32-NEXT: vor.vv v16, v24, v0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v0, (a6), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a5
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
@@ -1292,59 +1295,61 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-NEXT: lui a1, 1044480
-; RV32-NEXT: li a2, 56
-; RV32-NEXT: lui a3, 16
-; RV32-NEXT: li a4, 40
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v16, v8
+; RV32-NEXT: lui a0, 1044480
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: lui a4, 4080
; RV32-NEXT: addi a5, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT: addi a0, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a0, v0.t
-; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v16, (a5), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi a0, a2, -256
+; RV32-NEXT: vand.vx v24, v16, a0, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v8, v16, a4, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a5), zero
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 8, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v8, v16, a1, v0.t
+; RV32-NEXT: vsrl.vx v24, v16, a3, v0.t
; RV32-NEXT: vand.vx v24, v24, a0, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
@@ -1444,45 +1449,46 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v24, v8, a2
-; RV32-NEXT: addi a0, a3, -256
; RV32-NEXT: vsrl.vx v16, v8, a2
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: addi a2, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a0
+; RV32-NEXT: vand.vx v0, v0, a2
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v8, a2
; RV32-NEXT: vsll.vx v0, v0, a4
; RV32-NEXT: vor.vv v16, v24, v0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v0, (a6), zero
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a5
; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v8, v8, v24
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
index 15f6ca600cb37ec..799d12247e905e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
@@ -75,17 +75,17 @@ define fastcc <vscale x 64 x i32> @ret_split_nxv64i32(ptr %x) {
; CHECK-NEXT: slli a4, a2, 5
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub a4, a4, a3
-; CHECK-NEXT: add a5, a1, a2
-; CHECK-NEXT: vl8re32.v v16, (a5)
; CHECK-NEXT: add a5, a1, a3
+; CHECK-NEXT: vl8re32.v v16, (a5)
+; CHECK-NEXT: add a5, a1, a2
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: vl8re32.v v24, (a5)
; CHECK-NEXT: vl8re32.v v0, (a1)
; CHECK-NEXT: vs8r.v v8, (a0)
-; CHECK-NEXT: vs8r.v v16, (a2)
-; CHECK-NEXT: vs8r.v v24, (a3)
+; CHECK-NEXT: vs8r.v v24, (a2)
+; CHECK-NEXT: vs8r.v v16, (a3)
; CHECK-NEXT: add a0, a0, a4
; CHECK-NEXT: vs8r.v v0, (a0)
; CHECK-NEXT: ret
@@ -245,59 +245,21 @@ define fastcc <vscale x 32 x i1> @ret_nxv32i1_param_nxv32i1_nxv32i1(<vscale x 32
define fastcc <vscale x 32 x i32> @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %w) {
; CHECK-LABEL: ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: vl8re32.v v8, (a2)
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re32.v v24, (a2)
; CHECK-NEXT: vl8re32.v v0, (a0)
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a2, a2, a1
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl8re32.v v8, (a0)
-; CHECK-NEXT: vl8re32.v v16, (a2)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT: vadd.vv v0, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vadd.vv v24, v0, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vadd.vv v8, v0, v8
-; CHECK-NEXT: vadd.vv v8, v8, v16
-; CHECK-NEXT: vadd.vx v16, v8, a4
-; CHECK-NEXT: vadd.vx v8, v24, a4
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v0
+; CHECK-NEXT: vl8re32.v v0, (a2)
+; CHECK-NEXT: vadd.vv v8, v8, v24
+; CHECK-NEXT: vl8re32.v v24, (a0)
+; CHECK-NEXT: vadd.vv v16, v16, v24
+; CHECK-NEXT: vadd.vv v16, v16, v0
+; CHECK-NEXT: vadd.vx v16, v16, a4
+; CHECK-NEXT: vadd.vx v8, v8, a4
; CHECK-NEXT: ret
%r = add <vscale x 32 x i32> %x, %y
%s = add <vscale x 32 x i32> %r, %z
@@ -410,29 +372,31 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
; RV32-NEXT: andi sp, sp, -128
; RV32-NEXT: addi a1, sp, 128
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv8r.v v0, v8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a2)
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 128
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a2, a2, a1
; RV32-NEXT: add a3, a0, a1
-; RV32-NEXT: vl8re32.v v0, (a2)
-; RV32-NEXT: vl8re32.v v24, (a3)
-; RV32-NEXT: vl8re32.v v16, (a0)
+; RV32-NEXT: vl8re32.v v24, (a2)
+; RV32-NEXT: vl8re32.v v16, (a3)
+; RV32-NEXT: vl8re32.v v8, (a0)
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vs8r.v v0, (a0)
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 128
-; RV32-NEXT: vs8r.v v16, (a3)
+; RV32-NEXT: vs8r.v v8, (a3)
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: addi a2, sp, 128
; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
@@ -447,14 +411,13 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
; RV32-NEXT: addi a2, a2, 128
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: li a5, 42
-; RV32-NEXT: vs8r.v v24, (a1)
+; RV32-NEXT: vs8r.v v16, (a1)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 128
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv8r.v v16, v0
+; RV32-NEXT: vmv8r.v v16, v24
; RV32-NEXT: call ext3
; RV32-NEXT: addi sp, s0, -144
; RV32-NEXT: .cfi_def_cfa sp, 144
@@ -483,29 +446,31 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
; RV64-NEXT: andi sp, sp, -128
; RV64-NEXT: addi a1, sp, 128
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv8r.v v0, v8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a2)
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: slli a3, a3, 3
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 128
-; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a2, a2, a1
; RV64-NEXT: add a3, a0, a1
-; RV64-NEXT: vl8re32.v v0, (a2)
-; RV64-NEXT: vl8re32.v v24, (a3)
-; RV64-NEXT: vl8re32.v v16, (a0)
+; RV64-NEXT: vl8re32.v v24, (a2)
+; RV64-NEXT: vl8re32.v v16, (a3)
+; RV64-NEXT: vl8re32.v v8, (a0)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vs8r.v v0, (a0)
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: slli a3, a3, 5
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 128
-; RV64-NEXT: vs8r.v v16, (a3)
+; RV64-NEXT: vs8r.v v8, (a3)
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: addi a2, sp, 128
; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
@@ -520,14 +485,13 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
; RV64-NEXT: addi a2, a2, 128
; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: li a5, 42
-; RV64-NEXT: vs8r.v v24, (a1)
+; RV64-NEXT: vs8r.v v16, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 128
; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv8r.v v16, v0
+; RV64-NEXT: vmv8r.v v16, v24
; RV64-NEXT: call ext3
; RV64-NEXT: addi sp, s0, -144
; RV64-NEXT: .cfi_def_cfa sp, 144
@@ -549,9 +513,9 @@ define fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32
; CHECK-LABEL: vector_arg_indirect_stack:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vl8re32.v v24, (t5)
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, t5, a0
-; CHECK-NEXT: vl8re32.v v24, (t5)
; CHECK-NEXT: vl8re32.v v0, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
index 2e181e0914c886a..5ed7308cd0b94ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
@@ -7,10 +7,10 @@ define <vscale x 32 x i32> @callee_scalable_vector_split_indirect(<vscale x 32 x
; CHECK-LABEL: callee_scalable_vector_split_indirect:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: vl8re32.v v0, (a1)
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl8re32.v v0, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v24
; CHECK-NEXT: vadd.vv v16, v16, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
index a35cf14203f7851..5d7fc3bfd103483 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -218,106 +218,72 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
; CHECK-RV32-NEXT: addi sp, sp, -16
; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
; CHECK-RV32-NEXT: csrr a2, vlenb
-; CHECK-RV32-NEXT: slli a2, a2, 5
+; CHECK-RV32-NEXT: slli a2, a2, 4
; CHECK-RV32-NEXT: sub sp, sp, a2
-; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-RV32-NEXT: csrr a2, vlenb
-; CHECK-RV32-NEXT: li a3, 24
-; CHECK-RV32-NEXT: mul a2, a2, a3
+; CHECK-RV32-NEXT: slli a2, a2, 3
; CHECK-RV32-NEXT: add a2, sp, a2
; CHECK-RV32-NEXT: addi a2, a2, 16
; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV32-NEXT: vmv1r.v v7, v8
; CHECK-RV32-NEXT: li a2, 128
-; CHECK-RV32-NEXT: vslidedown.vi v9, v0, 1
+; CHECK-RV32-NEXT: vslidedown.vi v6, v0, 1
; CHECK-RV32-NEXT: li a3, 32
; CHECK-RV32-NEXT: vmv.x.s a4, v0
; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-RV32-NEXT: viota.m v16, v0
+; CHECK-RV32-NEXT: addi a5, sp, 16
+; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT: vcpop.m a5, v0
+; CHECK-RV32-NEXT: vsetvli zero, a5, e8, m8, ta, ma
+; CHECK-RV32-NEXT: vle8.v v24, (a0)
+; CHECK-RV32-NEXT: csrr a5, vlenb
+; CHECK-RV32-NEXT: slli a5, a5, 3
+; CHECK-RV32-NEXT: add a5, sp, a5
+; CHECK-RV32-NEXT: addi a5, a5, 16
+; CHECK-RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT: addi a5, sp, 16
+; CHECK-RV32-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t
+; CHECK-RV32-NEXT: csrr a5, vlenb
+; CHECK-RV32-NEXT: slli a5, a5, 3
+; CHECK-RV32-NEXT: add a5, sp, a5
+; CHECK-RV32-NEXT: addi a5, a5, 16
+; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: vle8.v v16, (a1)
-; CHECK-RV32-NEXT: csrr a1, vlenb
-; CHECK-RV32-NEXT: slli a1, a1, 3
-; CHECK-RV32-NEXT: add a1, sp, a1
-; CHECK-RV32-NEXT: addi a1, a1, 16
-; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT: vsrl.vx v10, v9, a3
+; CHECK-RV32-NEXT: vsrl.vx v10, v6, a3
; CHECK-RV32-NEXT: vsrl.vx v11, v0, a3
-; CHECK-RV32-NEXT: vmv.x.s a1, v9
+; CHECK-RV32-NEXT: vmv.x.s a1, v6
+; CHECK-RV32-NEXT: cpop a3, a4
; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV32-NEXT: vcpop.m a3, v0
-; CHECK-RV32-NEXT: cpop a4, a4
+; CHECK-RV32-NEXT: vcpop.m a4, v7
; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
; CHECK-RV32-NEXT: vmv.x.s a5, v10
; CHECK-RV32-NEXT: vmv.x.s a6, v11
-; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-RV32-NEXT: vle8.v v8, (a0)
-; CHECK-RV32-NEXT: csrr a3, vlenb
-; CHECK-RV32-NEXT: slli a3, a3, 4
-; CHECK-RV32-NEXT: add a3, sp, a3
-; CHECK-RV32-NEXT: addi a3, a3, 16
-; CHECK-RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: cpop a1, a1
-; CHECK-RV32-NEXT: cpop a3, a6
+; CHECK-RV32-NEXT: cpop a6, a6
; CHECK-RV32-NEXT: cpop a5, a5
-; CHECK-RV32-NEXT: add a3, a4, a3
+; CHECK-RV32-NEXT: add a3, a3, a6
; CHECK-RV32-NEXT: add a1, a1, a5
; CHECK-RV32-NEXT: add a1, a3, a1
; CHECK-RV32-NEXT: add a0, a0, a1
-; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV32-NEXT: vcpop.m a1, v7
-; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-RV32-NEXT: vle8.v v8, (a0)
-; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m8, ta, ma
+; CHECK-RV32-NEXT: vle8.v v24, (a0)
; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu
-; CHECK-RV32-NEXT: viota.m v24, v0
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: li a1, 24
-; CHECK-RV32-NEXT: mul a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 4
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: li a1, 24
-; CHECK-RV32-NEXT: mul a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: viota.m v16, v7
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 4
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT: viota.m v8, v7
; CHECK-RV32-NEXT: vmv1r.v v0, v7
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 3
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 4
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: li a1, 24
-; CHECK-RV32-NEXT: mul a0, a0, a1
+; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 5
+; CHECK-RV32-NEXT: slli a0, a0, 4
; CHECK-RV32-NEXT: add sp, sp, a0
; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16
; CHECK-RV32-NEXT: addi sp, sp, 16
@@ -329,96 +295,56 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
; CHECK-RV64-NEXT: addi sp, sp, -16
; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16
; CHECK-RV64-NEXT: csrr a2, vlenb
-; CHECK-RV64-NEXT: slli a2, a2, 5
+; CHECK-RV64-NEXT: slli a2, a2, 4
; CHECK-RV64-NEXT: sub sp, sp, a2
-; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-RV64-NEXT: csrr a2, vlenb
-; CHECK-RV64-NEXT: li a3, 24
-; CHECK-RV64-NEXT: mul a2, a2, a3
-; CHECK-RV64-NEXT: add a2, sp, a2
-; CHECK-RV64-NEXT: addi a2, a2, 16
-; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV64-NEXT: vmv1r.v v7, v8
; CHECK-RV64-NEXT: li a2, 128
-; CHECK-RV64-NEXT: vslidedown.vi v9, v0, 1
+; CHECK-RV64-NEXT: vslidedown.vi v6, v0, 1
; CHECK-RV64-NEXT: vmv.x.s a3, v0
; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV64-NEXT: vle8.v v16, (a1)
-; CHECK-RV64-NEXT: csrr a1, vlenb
-; CHECK-RV64-NEXT: slli a1, a1, 3
-; CHECK-RV64-NEXT: add a1, sp, a1
-; CHECK-RV64-NEXT: addi a1, a1, 16
-; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-RV64-NEXT: vmv.x.s a1, v9
-; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-RV64-NEXT: viota.m v8, v0
+; CHECK-RV64-NEXT: addi a4, sp, 16
+; CHECK-RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: vcpop.m a4, v0
; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma
-; CHECK-RV64-NEXT: vle8.v v8, (a0)
+; CHECK-RV64-NEXT: vle8.v v24, (a0)
+; CHECK-RV64-NEXT: addi a4, sp, 16
+; CHECK-RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t
; CHECK-RV64-NEXT: csrr a4, vlenb
-; CHECK-RV64-NEXT: slli a4, a4, 4
+; CHECK-RV64-NEXT: slli a4, a4, 3
; CHECK-RV64-NEXT: add a4, sp, a4
; CHECK-RV64-NEXT: addi a4, a4, 16
-; CHECK-RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: vle8.v v16, (a1)
+; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV64-NEXT: vmv.x.s a1, v6
; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-RV64-NEXT: vcpop.m a4, v7
; CHECK-RV64-NEXT: cpop a3, a3
+; CHECK-RV64-NEXT: viota.m v24, v7
+; CHECK-RV64-NEXT: addi a5, sp, 16
+; CHECK-RV64-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: cpop a1, a1
; CHECK-RV64-NEXT: add a0, a0, a3
; CHECK-RV64-NEXT: add a0, a0, a1
; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma
-; CHECK-RV64-NEXT: vle8.v v8, (a0)
-; CHECK-RV64-NEXT: addi a0, sp, 16
-; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu
-; CHECK-RV64-NEXT: viota.m v24, v0
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: li a1, 24
-; CHECK-RV64-NEXT: mul a0, a0, a1
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 4
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: li a1, 24
-; CHECK-RV64-NEXT: mul a0, a0, a1
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: viota.m v16, v7
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 4
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: vle8.v v24, (a0)
; CHECK-RV64-NEXT: vmv1r.v v0, v7
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 3
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: addi a0, sp, 16
-; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 4
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu
; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: li a1, 24
-; CHECK-RV64-NEXT: mul a0, a0, a1
+; CHECK-RV64-NEXT: slli a0, a0, 3
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 5
+; CHECK-RV64-NEXT: slli a0, a0, 4
; CHECK-RV64-NEXT: add sp, sp, a0
; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16
; CHECK-RV64-NEXT: addi sp, sp, 16
@@ -664,76 +590,66 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
; CHECK-RV32-NEXT: addi sp, sp, -16
; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
; CHECK-RV32-NEXT: csrr a1, vlenb
-; CHECK-RV32-NEXT: slli a1, a1, 5
-; CHECK-RV32-NEXT: sub sp, sp, a1
-; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-RV32-NEXT: csrr a1, vlenb
; CHECK-RV32-NEXT: li a2, 24
; CHECK-RV32-NEXT: mul a1, a1, a2
+; CHECK-RV32-NEXT: sub sp, sp, a1
+; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-RV32-NEXT: csrr a1, vlenb
+; CHECK-RV32-NEXT: slli a1, a1, 4
; CHECK-RV32-NEXT: add a1, sp, a1
; CHECK-RV32-NEXT: addi a1, a1, 16
; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: li a1, 64
+; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-RV32-NEXT: viota.m v16, v0
+; CHECK-RV32-NEXT: vcpop.m a2, v0
+; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-RV32-NEXT: vle16.v v24, (a0)
+; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT: csrr a2, vlenb
+; CHECK-RV32-NEXT: slli a2, a2, 3
+; CHECK-RV32-NEXT: add a2, sp, a2
+; CHECK-RV32-NEXT: addi a2, a2, 16
+; CHECK-RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 8
+; CHECK-RV32-NEXT: vslidedown.vi v24, v0, 8
; CHECK-RV32-NEXT: li a2, 32
; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; CHECK-RV32-NEXT: vmv.x.s a3, v0
; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT: vcpop.m a4, v0
+; CHECK-RV32-NEXT: vcpop.m a4, v24
; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT: vsrl.vx v25, v0, a2
-; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT: vcpop.m a2, v7
-; CHECK-RV32-NEXT: vsetvli zero, a4, e16, m8, ta, ma
-; CHECK-RV32-NEXT: vle16.v v16, (a0)
-; CHECK-RV32-NEXT: csrr a5, vlenb
-; CHECK-RV32-NEXT: slli a5, a5, 4
-; CHECK-RV32-NEXT: add a5, sp, a5
-; CHECK-RV32-NEXT: addi a5, a5, 16
-; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m1, ta, ma
-; CHECK-RV32-NEXT: vmv.x.s a4, v25
-; CHECK-RV32-NEXT: cpop a4, a4
+; CHECK-RV32-NEXT: vsrl.vx v8, v0, a2
+; CHECK-RV32-NEXT: cpop a2, a3
+; CHECK-RV32-NEXT: vmv.x.s a3, v8
; CHECK-RV32-NEXT: cpop a3, a3
-; CHECK-RV32-NEXT: add a3, a3, a4
-; CHECK-RV32-NEXT: slli a3, a3, 1
-; CHECK-RV32-NEXT: add a0, a0, a3
-; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV32-NEXT: vle16.v v16, (a0)
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 3
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV32-NEXT: viota.m v16, v0
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 4
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT: add a2, a2, a3
+; CHECK-RV32-NEXT: slli a2, a2, 1
+; CHECK-RV32-NEXT: add a0, a0, a2
+; CHECK-RV32-NEXT: vsetvli zero, a4, e16, m8, ta, ma
+; CHECK-RV32-NEXT: vle16.v v8, (a0)
; CHECK-RV32-NEXT: addi a0, sp, 16
; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: viota.m v8, v7
-; CHECK-RV32-NEXT: vmv1r.v v0, v7
+; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-RV32-NEXT: viota.m v8, v24
+; CHECK-RV32-NEXT: vmv1r.v v0, v24
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: li a1, 24
-; CHECK-RV32-NEXT: mul a0, a0, a1
+; CHECK-RV32-NEXT: slli a0, a0, 4
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT: addi a0, sp, 16
+; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t
-; CHECK-RV32-NEXT: addi a0, sp, 16
; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 5
+; CHECK-RV32-NEXT: li a1, 24
+; CHECK-RV32-NEXT: mul a0, a0, a1
; CHECK-RV32-NEXT: add sp, sp, a0
; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16
; CHECK-RV32-NEXT: addi sp, sp, 16
@@ -749,67 +665,70 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
; CHECK-RV64-NEXT: sub sp, sp, a1
; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-RV64-NEXT: csrr a1, vlenb
-; CHECK-RV64-NEXT: slli a1, a1, 3
+; CHECK-RV64-NEXT: slli a1, a1, 4
; CHECK-RV64-NEXT: add a1, sp, a1
; CHECK-RV64-NEXT: addi a1, a1, 16
; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: li a1, 64
-; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 8
-; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-RV64-NEXT: viota.m v16, v0
+; CHECK-RV64-NEXT: csrr a2, vlenb
+; CHECK-RV64-NEXT: li a3, 24
+; CHECK-RV64-NEXT: mul a2, a2, a3
+; CHECK-RV64-NEXT: add a2, sp, a2
+; CHECK-RV64-NEXT: addi a2, a2, 16
+; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: vcpop.m a2, v0
-; CHECK-RV64-NEXT: vcpop.m a3, v7
; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma
; CHECK-RV64-NEXT: vle16.v v24, (a0)
+; CHECK-RV64-NEXT: csrr a3, vlenb
+; CHECK-RV64-NEXT: li a4, 24
+; CHECK-RV64-NEXT: mul a3, a3, a4
+; CHECK-RV64-NEXT: add a3, sp, a3
+; CHECK-RV64-NEXT: addi a3, a3, 16
+; CHECK-RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV64-NEXT: addi a3, sp, 16
+; CHECK-RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-RV64-NEXT: vslidedown.vi v0, v0, 8
+; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-RV64-NEXT: vcpop.m a3, v0
+; CHECK-RV64-NEXT: viota.m v24, v0
; CHECK-RV64-NEXT: csrr a4, vlenb
-; CHECK-RV64-NEXT: slli a4, a4, 4
+; CHECK-RV64-NEXT: li a5, 24
+; CHECK-RV64-NEXT: mul a4, a4, a5
; CHECK-RV64-NEXT: add a4, sp, a4
; CHECK-RV64-NEXT: addi a4, a4, 16
; CHECK-RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: slli a2, a2, 1
; CHECK-RV64-NEXT: add a0, a0, a2
; CHECK-RV64-NEXT: vsetvli zero, a3, e16, m8, ta, ma
-; CHECK-RV64-NEXT: vle16.v v24, (a0)
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: li a2, 24
-; CHECK-RV64-NEXT: mul a0, a0, a2
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV64-NEXT: viota.m v24, v0
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 4
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT: addi a0, sp, 16
-; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: viota.m v16, v7
+; CHECK-RV64-NEXT: vle16.v v16, (a0)
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 4
+; CHECK-RV64-NEXT: slli a0, a0, 3
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: vmv1r.v v0, v7
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: li a1, 24
-; CHECK-RV64-NEXT: mul a0, a0, a1
+; CHECK-RV64-NEXT: li a2, 24
+; CHECK-RV64-NEXT: mul a0, a0, a2
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 3
+; CHECK-RV64-NEXT: slli a0, a0, 4
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 4
+; CHECK-RV64-NEXT: slli a0, a0, 3
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t
+; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-RV64-NEXT: vrgather.vv v16, v8, v24, v0.t
; CHECK-RV64-NEXT: addi a0, sp, 16
; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: csrr a0, vlenb
@@ -1023,67 +942,70 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
; CHECK-RV32-NEXT: sub sp, sp, a1
; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-RV32-NEXT: csrr a1, vlenb
-; CHECK-RV32-NEXT: slli a1, a1, 3
+; CHECK-RV32-NEXT: slli a1, a1, 4
; CHECK-RV32-NEXT: add a1, sp, a1
; CHECK-RV32-NEXT: addi a1, a1, 16
; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: li a1, 32
-; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 4
-; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-RV32-NEXT: viota.m v16, v0
+; CHECK-RV32-NEXT: csrr a2, vlenb
+; CHECK-RV32-NEXT: li a3, 24
+; CHECK-RV32-NEXT: mul a2, a2, a3
+; CHECK-RV32-NEXT: add a2, sp, a2
+; CHECK-RV32-NEXT: addi a2, a2, 16
+; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: vcpop.m a2, v0
-; CHECK-RV32-NEXT: vcpop.m a3, v7
; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-RV32-NEXT: vle32.v v24, (a0)
+; CHECK-RV32-NEXT: csrr a3, vlenb
+; CHECK-RV32-NEXT: li a4, 24
+; CHECK-RV32-NEXT: mul a3, a3, a4
+; CHECK-RV32-NEXT: add a3, sp, a3
+; CHECK-RV32-NEXT: addi a3, a3, 16
+; CHECK-RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT: addi a3, sp, 16
+; CHECK-RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-RV32-NEXT: vslidedown.vi v0, v0, 4
+; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-RV32-NEXT: vcpop.m a3, v0
+; CHECK-RV32-NEXT: viota.m v24, v0
; CHECK-RV32-NEXT: csrr a4, vlenb
-; CHECK-RV32-NEXT: slli a4, a4, 4
+; CHECK-RV32-NEXT: li a5, 24
+; CHECK-RV32-NEXT: mul a4, a4, a5
; CHECK-RV32-NEXT: add a4, sp, a4
; CHECK-RV32-NEXT: addi a4, a4, 16
; CHECK-RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: slli a2, a2, 2
; CHECK-RV32-NEXT: add a0, a0, a2
; CHECK-RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-RV32-NEXT: vle32.v v24, (a0)
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: li a2, 24
-; CHECK-RV32-NEXT: mul a0, a0, a2
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-RV32-NEXT: viota.m v24, v0
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 4
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: viota.m v16, v7
+; CHECK-RV32-NEXT: vle32.v v16, (a0)
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 4
+; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: vmv1r.v v0, v7
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: li a1, 24
-; CHECK-RV32-NEXT: mul a0, a0, a1
+; CHECK-RV32-NEXT: li a2, 24
+; CHECK-RV32-NEXT: mul a0, a0, a2
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 3
+; CHECK-RV32-NEXT: slli a0, a0, 4
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 4
+; CHECK-RV32-NEXT: slli a0, a0, 3
; CHECK-RV32-NEXT: add a0, sp, a0
; CHECK-RV32-NEXT: addi a0, a0, 16
; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t
+; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-RV32-NEXT: vrgather.vv v16, v8, v24, v0.t
; CHECK-RV32-NEXT: addi a0, sp, 16
; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
@@ -1108,23 +1030,40 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
; CHECK-RV64-NEXT: addi a1, a1, 16
; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: li a1, 32
+; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-RV64-NEXT: viota.m v16, v0
+; CHECK-RV64-NEXT: csrr a2, vlenb
+; CHECK-RV64-NEXT: li a3, 24
+; CHECK-RV64-NEXT: mul a2, a2, a3
+; CHECK-RV64-NEXT: add a2, sp, a2
+; CHECK-RV64-NEXT: addi a2, a2, 16
+; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: vcpop.m a2, v0
+; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-RV64-NEXT: vle32.v v24, (a0)
+; CHECK-RV64-NEXT: csrr a2, vlenb
+; CHECK-RV64-NEXT: li a3, 24
+; CHECK-RV64-NEXT: mul a2, a2, a3
+; CHECK-RV64-NEXT: add a2, sp, a2
+; CHECK-RV64-NEXT: addi a2, a2, 16
+; CHECK-RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV64-NEXT: addi a2, sp, 16
+; CHECK-RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 4
-; CHECK-RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-RV64-NEXT: vslidedown.vi v24, v0, 4
+; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-RV64-NEXT: vmv.x.s a2, v0
-; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV64-NEXT: vcpop.m a3, v0
-; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-RV64-NEXT: vle32.v v24, (a0)
-; CHECK-RV64-NEXT: csrr a3, vlenb
-; CHECK-RV64-NEXT: li a4, 24
-; CHECK-RV64-NEXT: mul a3, a3, a4
-; CHECK-RV64-NEXT: add a3, sp, a3
-; CHECK-RV64-NEXT: addi a3, a3, 16
-; CHECK-RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV64-NEXT: vcpop.m a3, v7
+; CHECK-RV64-NEXT: vcpop.m a3, v24
; CHECK-RV64-NEXT: cpopw a2, a2
+; CHECK-RV64-NEXT: viota.m v0, v24
+; CHECK-RV64-NEXT: csrr a4, vlenb
+; CHECK-RV64-NEXT: li a5, 24
+; CHECK-RV64-NEXT: mul a4, a4, a5
+; CHECK-RV64-NEXT: add a4, sp, a4
+; CHECK-RV64-NEXT: addi a4, a4, 16
+; CHECK-RV64-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: slli a2, a2, 2
; CHECK-RV64-NEXT: add a0, a0, a2
; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -1134,30 +1073,25 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-RV64-NEXT: viota.m v24, v0
+; CHECK-RV64-NEXT: vmv1r.v v0, v24
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: li a1, 24
-; CHECK-RV64-NEXT: mul a0, a0, a1
+; CHECK-RV64-NEXT: slli a0, a0, 4
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT: addi a0, sp, 16
-; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: viota.m v8, v7
-; CHECK-RV64-NEXT: vmv1r.v v0, v7
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 4
+; CHECK-RV64-NEXT: li a2, 24
+; CHECK-RV64-NEXT: mul a0, a0, a2
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: csrr a0, vlenb
; CHECK-RV64-NEXT: slli a0, a0, 3
; CHECK-RV64-NEXT: add a0, sp, a0
; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t
+; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-RV64-NEXT: vrgather.vv v16, v8, v24, v0.t
; CHECK-RV64-NEXT: addi a0, sp, 16
; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: csrr a0, vlenb
@@ -1329,33 +1263,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
; CHECK-RV32-NEXT: addi sp, sp, -16
; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
; CHECK-RV32-NEXT: csrr a1, vlenb
-; CHECK-RV32-NEXT: slli a1, a1, 5
+; CHECK-RV32-NEXT: li a2, 24
+; CHECK-RV32-NEXT: mul a1, a1, a2
; CHECK-RV32-NEXT: sub sp, sp, a1
-; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-RV32-NEXT: csrr a1, vlenb
; CHECK-RV32-NEXT: slli a1, a1, 4
; CHECK-RV32-NEXT: add a1, sp, a1
; CHECK-RV32-NEXT: addi a1, a1, 16
; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV32-NEXT: vcpop.m a1, v0
-; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-RV32-NEXT: viota.m v16, v0
+; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-RV32-NEXT: vle64.v v24, (a0)
-; CHECK-RV32-NEXT: csrr a1, vlenb
-; CHECK-RV32-NEXT: li a2, 24
-; CHECK-RV32-NEXT: mul a1, a1, a2
-; CHECK-RV32-NEXT: add a1, sp, a1
-; CHECK-RV32-NEXT: addi a1, a1, 16
-; CHECK-RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT: addi a1, sp, 16
+; CHECK-RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-RV32-NEXT: vmv.x.s a1, v0
; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 2
+; CHECK-RV32-NEXT: vslidedown.vi v0, v0, 2
; CHECK-RV32-NEXT: zext.h a1, a1
; CHECK-RV32-NEXT: cpop a1, a1
; CHECK-RV32-NEXT: slli a1, a1, 3
; CHECK-RV32-NEXT: add a0, a0, a1
; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-RV32-NEXT: vcpop.m a1, v7
+; CHECK-RV32-NEXT: vcpop.m a1, v0
; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-RV32-NEXT: vle64.v v16, (a0)
; CHECK-RV32-NEXT: csrr a0, vlenb
@@ -1364,18 +1299,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
; CHECK-RV32-NEXT: addi a0, a0, 16
; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
-; CHECK-RV32-NEXT: viota.m v24, v0
-; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: li a1, 24
-; CHECK-RV32-NEXT: mul a0, a0, a1
-; CHECK-RV32-NEXT: add a0, sp, a0
-; CHECK-RV32-NEXT: addi a0, a0, 16
-; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT: addi a0, sp, 16
-; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT: viota.m v8, v7
-; CHECK-RV32-NEXT: vmv1r.v v0, v7
+; CHECK-RV32-NEXT: viota.m v8, v0
; CHECK-RV32-NEXT: csrr a0, vlenb
; CHECK-RV32-NEXT: slli a0, a0, 4
; CHECK-RV32-NEXT: add a0, sp, a0
@@ -1390,7 +1314,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
; CHECK-RV32-NEXT: addi a0, sp, 16
; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV32-NEXT: csrr a0, vlenb
-; CHECK-RV32-NEXT: slli a0, a0, 5
+; CHECK-RV32-NEXT: li a1, 24
+; CHECK-RV32-NEXT: mul a0, a0, a1
; CHECK-RV32-NEXT: add sp, sp, a0
; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16
; CHECK-RV32-NEXT: addi sp, sp, 16
@@ -1402,33 +1327,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
; CHECK-RV64-NEXT: addi sp, sp, -16
; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16
; CHECK-RV64-NEXT: csrr a1, vlenb
-; CHECK-RV64-NEXT: slli a1, a1, 5
+; CHECK-RV64-NEXT: li a2, 24
+; CHECK-RV64-NEXT: mul a1, a1, a2
; CHECK-RV64-NEXT: sub sp, sp, a1
-; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-RV64-NEXT: csrr a1, vlenb
; CHECK-RV64-NEXT: slli a1, a1, 4
; CHECK-RV64-NEXT: add a1, sp, a1
; CHECK-RV64-NEXT: addi a1, a1, 16
; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV64-NEXT: vcpop.m a1, v0
-; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-RV64-NEXT: viota.m v16, v0
+; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-RV64-NEXT: vle64.v v24, (a0)
-; CHECK-RV64-NEXT: csrr a1, vlenb
-; CHECK-RV64-NEXT: li a2, 24
-; CHECK-RV64-NEXT: mul a1, a1, a2
-; CHECK-RV64-NEXT: add a1, sp, a1
-; CHECK-RV64-NEXT: addi a1, a1, 16
-; CHECK-RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
+; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV64-NEXT: addi a1, sp, 16
+; CHECK-RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-RV64-NEXT: vmv.x.s a1, v0
; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 2
+; CHECK-RV64-NEXT: vslidedown.vi v0, v0, 2
; CHECK-RV64-NEXT: zext.h a1, a1
; CHECK-RV64-NEXT: cpopw a1, a1
; CHECK-RV64-NEXT: slli a1, a1, 3
; CHECK-RV64-NEXT: add a0, a0, a1
; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-RV64-NEXT: vcpop.m a1, v7
+; CHECK-RV64-NEXT: vcpop.m a1, v0
; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-RV64-NEXT: vle64.v v16, (a0)
; CHECK-RV64-NEXT: csrr a0, vlenb
@@ -1437,18 +1363,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
; CHECK-RV64-NEXT: addi a0, a0, 16
; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu
-; CHECK-RV64-NEXT: viota.m v24, v0
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: li a1, 24
-; CHECK-RV64-NEXT: mul a0, a0, a1
-; CHECK-RV64-NEXT: add a0, sp, a0
-; CHECK-RV64-NEXT: addi a0, a0, 16
-; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT: addi a0, sp, 16
-; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT: viota.m v8, v7
-; CHECK-RV64-NEXT: vmv1r.v v0, v7
+; CHECK-RV64-NEXT: viota.m v8, v0
; CHECK-RV64-NEXT: csrr a0, vlenb
; CHECK-RV64-NEXT: slli a0, a0, 4
; CHECK-RV64-NEXT: add a0, sp, a0
@@ -1463,7 +1378,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
; CHECK-RV64-NEXT: addi a0, sp, 16
; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 5
+; CHECK-RV64-NEXT: li a1, 24
+; CHECK-RV64-NEXT: mul a0, a0, a1
; CHECK-RV64-NEXT: add sp, sp, a0
; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16
; CHECK-RV64-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll
index 425422417ec78f5..753a90c22a366a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll
@@ -9,10 +9,10 @@ define <512 x i8> @bitcast_1024B(<256 x i16> %a, <512 x i8> %b) {
; VLEN256-NEXT: addi a1, a0, 256
; VLEN256-NEXT: li a2, 256
; VLEN256-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; VLEN256-NEXT: vle8.v v24, (a0)
-; VLEN256-NEXT: vle8.v v0, (a1)
-; VLEN256-NEXT: vadd.vv v8, v24, v8
-; VLEN256-NEXT: vadd.vv v16, v0, v16
+; VLEN256-NEXT: vle8.v v24, (a1)
+; VLEN256-NEXT: vle8.v v0, (a0)
+; VLEN256-NEXT: vadd.vv v8, v0, v8
+; VLEN256-NEXT: vadd.vv v16, v24, v16
; VLEN256-NEXT: ret
;
; VLEN512-LABEL: bitcast_1024B:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 5ea4924468595d4..d86fcaf1df5f6ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1676,35 +1676,36 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
; RV32-NEXT: addi a3, a4, 819
; RV32-NEXT: sw a3, 32(sp)
; RV32-NEXT: sw a3, 36(sp)
-; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: lui a3, 4080
; RV32-NEXT: addi a4, a5, 1365
-; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a5, a6, -256
; RV32-NEXT: sw a4, 24(sp)
; RV32-NEXT: sw a4, 28(sp)
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT: addi a5, a6, -256
; RV32-NEXT: vand.vx v8, v8, a5, v0.t
; RV32-NEXT: vsll.vx v8, v8, a2, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 4
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 48
+; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a3, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a6, sp, 48
+; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a4), zero
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: slli a4, a4, 3
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 48
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vx v16, v24, a3, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
@@ -1739,14 +1740,14 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 40
; RV32-NEXT: addi a2, sp, 32
+; RV32-NEXT: addi a3, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -1761,7 +1762,7 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
@@ -1869,59 +1870,60 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v16, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: lui a4, 349525
; RV32-NEXT: li a5, 56
; RV32-NEXT: lui a6, 16
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a5
-; RV32-NEXT: vsrl.vx v24, v8, a5
+; RV32-NEXT: vsll.vx v8, v8, a5
+; RV32-NEXT: vsrl.vx v24, v16, a5
; RV32-NEXT: li a5, 40
+; RV32-NEXT: addi a6, a6, -256
+; RV32-NEXT: vsrl.vx v0, v16, a5
+; RV32-NEXT: vand.vx v0, v0, a6
+; RV32-NEXT: vor.vv v24, v0, v24
+; RV32-NEXT: addi a7, sp, 48
+; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v16, a6
+; RV32-NEXT: lui a6, 4080
+; RV32-NEXT: vsll.vx v0, v0, a5
+; RV32-NEXT: addi a5, sp, 16
+; RV32-NEXT: vor.vv v8, v8, v0
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 48
+; RV32-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v16, 24
; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: addi a1, a2, -241
+; RV32-NEXT: addi a2, a3, 819
+; RV32-NEXT: addi a3, a4, 1365
+; RV32-NEXT: vand.vx v0, v0, a6
; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: addi a2, a3, 819
; RV32-NEXT: sw a2, 32(sp)
; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: addi a3, a4, 1365
-; RV32-NEXT: addi a4, a6, -256
-; RV32-NEXT: vsrl.vx v0, v8, a5
; RV32-NEXT: sw a3, 24(sp)
; RV32-NEXT: sw a3, 28(sp)
-; RV32-NEXT: vand.vx v0, v0, a4
-; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a4
-; RV32-NEXT: vsll.vx v0, v0, a5
-; RV32-NEXT: vor.vv v16, v16, v0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vlse64.v v8, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a1
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a1
-; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v0, v8, v24
+; RV32-NEXT: vand.vv v24, v24, v8
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vand.vx v16, v16, a6
+; RV32-NEXT: vsll.vi v16, v16, 24
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v0, v16, v8
; RV32-NEXT: addi a1, sp, 48
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: addi a1, sp, 40
; RV32-NEXT: addi a2, sp, 32
; RV32-NEXT: csrr a3, vlenb
@@ -2072,35 +2074,36 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
; RV32-NEXT: addi a3, a4, 819
; RV32-NEXT: sw a3, 32(sp)
; RV32-NEXT: sw a3, 36(sp)
-; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: lui a3, 4080
; RV32-NEXT: addi a4, a5, 1365
-; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a5, a6, -256
; RV32-NEXT: sw a4, 24(sp)
; RV32-NEXT: sw a4, 28(sp)
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT: addi a5, a6, -256
; RV32-NEXT: vand.vx v8, v8, a5, v0.t
; RV32-NEXT: vsll.vx v8, v8, a2, v0.t
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 4
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 48
+; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v16, v24, a3, v0.t
+; RV32-NEXT: vsll.vi v8, v16, 24, v0.t
+; RV32-NEXT: addi a6, sp, 48
+; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a4), zero
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: slli a4, a4, 3
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 48
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vx v16, v24, a3, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v24, v8, v0.t
; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a4, sp, 48
; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
@@ -2135,14 +2138,14 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a1, sp, 40
; RV32-NEXT: addi a2, sp, 32
+; RV32-NEXT: addi a3, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 48
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -2157,7 +2160,7 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vi v24, v24, 2, v0.t
; RV32-NEXT: vor.vv v16, v16, v24, v0.t
@@ -2265,59 +2268,60 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v16, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: lui a4, 349525
; RV32-NEXT: li a5, 56
; RV32-NEXT: lui a6, 16
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a5
-; RV32-NEXT: vsrl.vx v24, v8, a5
+; RV32-NEXT: vsll.vx v8, v8, a5
+; RV32-NEXT: vsrl.vx v24, v16, a5
; RV32-NEXT: li a5, 40
+; RV32-NEXT: addi a6, a6, -256
+; RV32-NEXT: vsrl.vx v0, v16, a5
+; RV32-NEXT: vand.vx v0, v0, a6
+; RV32-NEXT: vor.vv v24, v0, v24
+; RV32-NEXT: addi a7, sp, 48
+; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v16, a6
+; RV32-NEXT: lui a6, 4080
+; RV32-NEXT: vsll.vx v0, v0, a5
+; RV32-NEXT: addi a5, sp, 16
+; RV32-NEXT: vor.vv v8, v8, v0
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 48
+; RV32-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v16, 24
; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: addi a1, a2, -241
+; RV32-NEXT: addi a2, a3, 819
+; RV32-NEXT: addi a3, a4, 1365
+; RV32-NEXT: vand.vx v0, v0, a6
; RV32-NEXT: sw a1, 40(sp)
; RV32-NEXT: sw a1, 44(sp)
-; RV32-NEXT: lui a1, 4080
-; RV32-NEXT: addi a2, a3, 819
; RV32-NEXT: sw a2, 32(sp)
; RV32-NEXT: sw a2, 36(sp)
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: addi a3, a4, 1365
-; RV32-NEXT: addi a4, a6, -256
-; RV32-NEXT: vsrl.vx v0, v8, a5
; RV32-NEXT: sw a3, 24(sp)
; RV32-NEXT: sw a3, 28(sp)
-; RV32-NEXT: vand.vx v0, v0, a4
-; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a4
-; RV32-NEXT: vsll.vx v0, v0, a5
-; RV32-NEXT: vor.vv v16, v16, v0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vlse64.v v8, (a5), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a1
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: vand.vx v8, v8, a1
-; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v0, v8, v24
+; RV32-NEXT: vand.vv v24, v24, v8
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v8, v16, v8
+; RV32-NEXT: vand.vx v16, v16, a6
+; RV32-NEXT: vsll.vi v16, v16, 24
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v0, v16, v8
; RV32-NEXT: addi a1, sp, 48
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: addi a1, sp, 40
; RV32-NEXT: addi a2, sp, 32
; RV32-NEXT: csrr a3, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index d765e4c0b8f6a98..a4e78a750aa6410 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -768,61 +768,63 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v16, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
+; RV32-NEXT: vand.vx v24, v16, a1, v0.t
; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v8, v16, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 8, v0.t
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT: vsrl.vx v24, v16, a4, v0.t
; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a5, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
@@ -916,48 +918,48 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v24, v8, a2
-; RV32-NEXT: addi a1, a3, -256
; RV32-NEXT: vsrl.vx v16, v8, a2
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a1
+; RV32-NEXT: vand.vx v0, v0, a3
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a1
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v8, a3
; RV32-NEXT: vsll.vx v0, v0, a4
; RV32-NEXT: vor.vv v16, v24, v0
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a5
+; RV32-NEXT: vsrl.vi v24, v8, 8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a6), zero
+; RV32-NEXT: vlse64.v v16, (a2), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
@@ -1031,61 +1033,63 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv8r.v v16, v8
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: li a2, 56
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
-; RV32-NEXT: addi a5, sp, 8
+; RV32-NEXT: lui a5, 4080
+; RV32-NEXT: addi a6, sp, 8
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: sw zero, 12(sp)
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT: vsll.vx v8, v8, a2, v0.t
; RV32-NEXT: addi a1, a3, -256
-; RV32-NEXT: vand.vx v24, v8, a1, v0.t
+; RV32-NEXT: vand.vx v24, v16, a1, v0.t
; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
-; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vor.vv v8, v8, v24, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v8, v16, a5, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 24, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a5), zero
+; RV32-NEXT: vlse64.v v8, (a6), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vx v24, v8, a3, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: vand.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsll.vi v8, v24, 8, v0.t
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
-; RV32-NEXT: vsll.vi v16, v24, 8, v0.t
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT: vsrl.vx v24, v16, a4, v0.t
; RV32-NEXT: vand.vx v24, v24, a1, v0.t
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT: vand.vx v24, v24, a3, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a5, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
@@ -1179,48 +1183,48 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: lui a3, 16
; RV32-NEXT: li a4, 40
; RV32-NEXT: lui a5, 4080
-; RV32-NEXT: addi a6, sp, 8
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsll.vx v24, v8, a2
-; RV32-NEXT: addi a1, a3, -256
; RV32-NEXT: vsrl.vx v16, v8, a2
+; RV32-NEXT: addi a2, sp, 8
+; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vsrl.vx v0, v8, a4
-; RV32-NEXT: vand.vx v0, v0, a1
+; RV32-NEXT: vand.vx v0, v0, a3
; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vand.vx v0, v8, a1
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v8, a3
; RV32-NEXT: vsll.vx v0, v0, a4
; RV32-NEXT: vor.vv v16, v24, v0
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: vand.vx v0, v0, a5
+; RV32-NEXT: vsrl.vi v24, v8, 8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a6), zero
+; RV32-NEXT: vlse64.v v16, (a2), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 24
-; RV32-NEXT: vand.vx v16, v16, a5
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vor.vv v16, v24, v16
-; RV32-NEXT: vand.vv v24, v8, v0
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: vand.vx v8, v8, a5
; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vsll.vi v24, v24, 8
-; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v24, v16
; RV32-NEXT: vor.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
index 60a9948198c8f56..506d105fd4665f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
@@ -87,14 +87,14 @@ define fastcc <128 x i32> @ret_split_v128i32(ptr %x) {
; CHECK-NEXT: addi a2, a1, 256
; CHECK-NEXT: vle32.v v16, (a2)
; CHECK-NEXT: addi a2, a1, 384
-; CHECK-NEXT: vle32.v v24, (a1)
-; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v0, (a2)
-; CHECK-NEXT: addi a2, a0, 256
-; CHECK-NEXT: vse32.v v24, (a0)
+; CHECK-NEXT: vle32.v v24, (a2)
+; CHECK-NEXT: addi a2, a0, 384
+; CHECK-NEXT: vle32.v v0, (a1)
+; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vse32.v v0, (a0)
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vse32.v v0, (a1)
-; CHECK-NEXT: vse32.v v16, (a2)
+; CHECK-NEXT: vse32.v v24, (a2)
+; CHECK-NEXT: vse32.v v16, (a1)
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%v = load <128 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
index f42b4a3a26aad03..9b2c54458acbf76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -87,14 +87,14 @@ define <128 x i32> @ret_split_v128i32(ptr %x) {
; CHECK-NEXT: addi a2, a1, 256
; CHECK-NEXT: vle32.v v16, (a2)
; CHECK-NEXT: addi a2, a1, 384
-; CHECK-NEXT: vle32.v v24, (a1)
-; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v0, (a2)
-; CHECK-NEXT: addi a2, a0, 256
-; CHECK-NEXT: vse32.v v24, (a0)
+; CHECK-NEXT: vle32.v v24, (a2)
+; CHECK-NEXT: addi a2, a0, 384
+; CHECK-NEXT: vle32.v v0, (a1)
+; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vse32.v v0, (a0)
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vse32.v v0, (a1)
-; CHECK-NEXT: vse32.v v16, (a2)
+; CHECK-NEXT: vse32.v v24, (a2)
+; CHECK-NEXT: vse32.v v16, (a1)
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%v = load <128 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 9d0d42cf754c5e0..3a4d9e3c11de7e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1503,38 +1503,29 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -1547,57 +1538,34 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -1775,38 +1743,29 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -1819,57 +1778,34 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -2055,7 +1991,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -2087,7 +2024,6 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: li a1, 32
; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -2102,34 +2038,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a3, a3, a5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v8, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a4), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
@@ -2137,38 +2064,41 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -2180,61 +2110,37 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a3, a3, a5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -2244,7 +2150,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
@@ -2266,18 +2173,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -2290,41 +2199,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
+; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -2332,21 +2225,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -2550,80 +2442,63 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v0, v8
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 16
-; RV32-NEXT: vor.vv v16, v16, v8
+; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v24
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: vand.vv v0, v0, v24
+; RV32-NEXT: vsub.vv v0, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v16, a2
-; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vor.vv v24, v16, v8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v24, v24, v8
-; RV32-NEXT: vadd.vv v24, v0, v24
+; RV32-NEXT: vand.vv v16, v0, v8
+; RV32-NEXT: vsrl.vi v0, v0, 2
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: vadd.vv v16, v16, v0
; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vsrl.vi v0, v16, 1
+; RV32-NEXT: vnot.v v24, v24
+; RV32-NEXT: vsrl.vi v0, v24, 1
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v0, v0, v24
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
; RV32-NEXT: addi a2, sp, 24
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsub.vv v0, v16, v0
+; RV32-NEXT: vsub.vv v24, v24, v0
; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v8, v0, v8
+; RV32-NEXT: vand.vv v0, v24, v8
+; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v8, v24, v8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vlse64.v v24, (a2), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vlse64.v v0, (a3), zero
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v24, v8, v0
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v8, v16, a2
@@ -4213,38 +4088,29 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -4257,57 +4123,34 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -4483,38 +4326,29 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctlz_zero_undef_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -4527,57 +4361,34 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vor.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -4761,7 +4572,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -4793,7 +4605,6 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: li a1, 32
; RV32-NEXT: addi a3, sp, 40
-; RV32-NEXT: addi a4, sp, 32
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t
; RV32-NEXT: vor.vv v8, v8, v16, v0.t
@@ -4808,34 +4619,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a3, a3, a5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v8, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a4), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
@@ -4843,38 +4645,41 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 48
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -4886,61 +4691,37 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a3, a3, a5
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -4950,7 +4731,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: and a0, a0, a3
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
@@ -4972,18 +4754,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -4996,41 +4780,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
+; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -5038,21 +4806,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -5256,80 +5023,63 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v0, v8
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v16, 16
-; RV32-NEXT: vor.vv v16, v16, v8
+; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v0, 1
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v24
+; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: vand.vv v0, v0, v24
+; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vor.vv v24, v16, v8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v24, v0, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v16, a2
-; RV32-NEXT: vor.vv v16, v16, v0
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v24, v24, v8
-; RV32-NEXT: vadd.vv v24, v0, v24
+; RV32-NEXT: vand.vv v16, v0, v8
+; RV32-NEXT: vsrl.vi v0, v0, 2
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: vadd.vv v16, v16, v0
; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v16, v16
-; RV32-NEXT: vsrl.vi v0, v16, 1
+; RV32-NEXT: vnot.v v24, v24
+; RV32-NEXT: vsrl.vi v0, v24, 1
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v0, v0, v24
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v0, v0, v16
; RV32-NEXT: addi a2, sp, 24
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsub.vv v0, v16, v0
+; RV32-NEXT: vsub.vv v24, v24, v0
; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v0, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v8, v0, v8
+; RV32-NEXT: vand.vv v0, v24, v8
+; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v8, v24, v8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
+; RV32-NEXT: vlse64.v v24, (a2), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a3), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vlse64.v v0, (a3), zero
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v0
+; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v24, v8, v0
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v8, v16, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index a5a106184242749..d853d04093335fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1119,70 +1119,55 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32)
define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: vand.vv v24, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -1318,70 +1303,55 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32)
define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: vand.vv v24, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -1525,7 +1495,8 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -1553,87 +1524,89 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB34_2:
-; RV32-NEXT: addi a2, sp, 40
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: addi a2, sp, 32
+; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a2), zero
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 32
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a2), zero
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
+; RV32-NEXT: li a3, 40
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: addi a2, sp, 24
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: addi a2, sp, 48
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: csrr a2, vlenb
@@ -1647,32 +1620,39 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
@@ -1681,6 +1661,17 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: csrr a0, vlenb
@@ -1792,12 +1783,9 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi sp, sp, -48
; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv8r.v v24, v16
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a1, a1, 1365
@@ -1821,107 +1809,76 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: li a1, 16
; RV32-NEXT: .LBB35_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 1
+; RV32-NEXT: vsrl.vi v24, v8, 1
; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a2), zero
+; RV32-NEXT: vlse64.v v8, (a2), zero
; RV32-NEXT: addi a2, a0, -16
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: addi a2, sp, 32
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v0, v16
+; RV32-NEXT: vand.vv v24, v24, v8
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v0, v16, 1
+; RV32-NEXT: vand.vv v0, v0, v8
+; RV32-NEXT: addi a2, sp, 32
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsub.vv v8, v8, v24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v0, (a2), zero
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vmv8r.v v8, v24
+; RV32-NEXT: vlse64.v v24, (a2), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v24, 1
-; RV32-NEXT: vand.vv v16, v24, v16
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v16, v24, v16
+; RV32-NEXT: vand.vv v0, v8, v24
; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vand.vv v0, v16, v24
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v16, v16, 2
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v0
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v8, v0
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: addi a2, sp, 24
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: addi a3, sp, 48
+; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v8, v16
+; RV32-NEXT: vadd.vv v24, v24, v8
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a2), zero
; RV32-NEXT: addi a2, sp, 16
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v24, v0
+; RV32-NEXT: vadd.vv v16, v0, v16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v0, v24, 4
+; RV32-NEXT: vadd.vv v24, v24, v0
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a2), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v24, 4
-; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v24, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1936,8 +1893,7 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v24, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 48
; RV32-NEXT: addi sp, sp, 48
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index cd4b19f11d1602f..3896bb29de988ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1263,91 +1263,59 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32)
define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -1495,91 +1463,59 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32)
define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -1730,17 +1666,17 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: addi sp, sp, -48
; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 48
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v7, v0, 2
+; RV32-NEXT: vslidedown.vi v24, v0, 2
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a1, a1, 1365
@@ -1770,7 +1706,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
+; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 48
@@ -1778,94 +1714,108 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
+; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a3), zero
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -1873,71 +1823,88 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v24, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v24, 2, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
+; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 48
@@ -2044,29 +2011,25 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw a2, 16(sp)
+; RV32-NEXT: sw a2, 20(sp)
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: li a3, 16
; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a3, .LBB35_2
; RV32-NEXT: # %bb.1:
@@ -2075,19 +2038,20 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: li a2, 1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: addi a3, sp, 40
+; RV32-NEXT: addi a3, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a3), zero
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v8, v8, a2
; RV32-NEXT: vand.vv v8, v0, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v0, v16, a2
+; RV32-NEXT: addi a2, sp, 8
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2099,8 +2063,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vand.vv v24, v8, v24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: addi a2, sp, 24
-; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: mv a3, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2115,19 +2078,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v24, 4
; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a2), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -2141,11 +2099,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vx v8, v16, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v24, a2
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -3453,91 +3407,59 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -3683,91 +3605,59 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: sw a1, 32(sp)
-; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: lui a1, 4112
; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
-; RV32-NEXT: addi a1, sp, 40
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: addi a1, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT: vand.vv v24, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v24, (a1), zero
-; RV32-NEXT: addi a1, sp, 48
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: addi a1, sp, 8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, sp, 24
-; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT: vand.vv v24, v24, v8, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a1), zero
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: mv a1, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v16, (a1), zero
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 48
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
@@ -3916,17 +3806,17 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: addi sp, sp, -48
; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 48
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v7, v0, 2
+; RV32-NEXT: vslidedown.vi v24, v0, 2
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a1, a1, 1365
@@ -3956,7 +3846,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
+; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 48
@@ -3964,94 +3854,108 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 40
+; RV32-NEXT: li a4, 24
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, sp, 32
-; RV32-NEXT: vlse64.v v8, (a3), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a3), zero
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: addi a3, sp, 24
-; RV32-NEXT: addi a4, sp, 16
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v16, (a3), zero
-; RV32-NEXT: addi a3, sp, 48
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vlse64.v v8, (a4), zero
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 24
+; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a3), zero
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 24
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 48
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -4059,71 +3963,88 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 48
-; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: vnot.v v16, v16, v0.t
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v24, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v24, v8, v0.t
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v24, 2, v0.t
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: addi a0, sp, 48
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
+; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 48
@@ -4230,29 +4151,25 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: sw a1, 40(sp)
-; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: sw a1, 28(sp)
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: sw a2, 32(sp)
-; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw a2, 16(sp)
+; RV32-NEXT: sw a2, 20(sp)
; RV32-NEXT: lui a2, 4112
; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: sw a1, 24(sp)
-; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
; RV32-NEXT: li a3, 16
; RV32-NEXT: addi a1, a2, 257
-; RV32-NEXT: sw a1, 16(sp)
-; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a3, .LBB71_2
; RV32-NEXT: # %bb.1:
@@ -4261,19 +4178,20 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: li a2, 1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vnot.v v0, v8
-; RV32-NEXT: addi a3, sp, 40
+; RV32-NEXT: addi a3, sp, 24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a3), zero
; RV32-NEXT: addi a3, a0, -16
; RV32-NEXT: sltu a0, a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v8, v8, a2
; RV32-NEXT: vand.vv v8, v0, v8
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v0, v16, a2
+; RV32-NEXT: addi a2, sp, 8
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -4285,8 +4203,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vand.vv v24, v8, v24
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v8, (a3), zero
-; RV32-NEXT: addi a2, sp, 24
-; RV32-NEXT: addi a3, sp, 16
+; RV32-NEXT: mv a3, sp
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -4301,19 +4218,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v24, 4
; RV32-NEXT: vadd.vv v16, v24, v16
-; RV32-NEXT: addi a4, sp, 48
-; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v24, (a2), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v0, v8
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vlse64.v v0, (a3), zero
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: addi a2, sp, 48
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -4327,11 +4239,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vx v8, v16, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v24, a2
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
-; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 4f11e6c3c386a2d..95cc3ee860976a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -587,7 +587,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v25, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
@@ -601,29 +601,29 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vslidedown.vi v7, v0, 2
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v7, v0, 2
-; CHECK-NEXT: bltu a2, a1, .LBB24_2
+; CHECK-NEXT: bltu a2, a3, .LBB24_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB24_2:
; CHECK-NEXT: vmv1r.v v0, v25
-; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: mul a0, a0, a3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t
; CHECK-NEXT: vmv1r.v v0, v26
; CHECK-NEXT: vmv8r.v v8, v16
@@ -680,10 +680,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t
; CHECK-NEXT: vmv1r.v v0, v25
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v24, v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
@@ -718,13 +718,13 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB25_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index 2e2103ad5e06da1..e695819104bf61a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -587,7 +587,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v25, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
@@ -601,29 +601,29 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vslidedown.vi v7, v0, 2
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v7, v0, 2
-; CHECK-NEXT: bltu a2, a1, .LBB24_2
+; CHECK-NEXT: bltu a2, a3, .LBB24_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB24_2:
; CHECK-NEXT: vmv1r.v v0, v25
-; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: mul a0, a0, a3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t
; CHECK-NEXT: vmv1r.v v0, v26
; CHECK-NEXT: vmv8r.v v8, v16
@@ -680,10 +680,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t
; CHECK-NEXT: vmv1r.v v0, v25
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v24, v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
@@ -718,13 +718,13 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB25_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 0c7d7925edf39c2..4914d6393e530d7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -203,67 +203,73 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vle32.v v8, (a4)
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 68
-; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: addi a5, a5, 3
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vle16.v v6, (a6)
-; RV32-NEXT: vmv.s.x v0, a5
; RV32-NEXT: lui a1, %hi(.LCPI8_1)
; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1)
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v24, v16, v6
+; RV32-NEXT: vle32.v v24, (a4)
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 52
-; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: li t1, 60
+; RV32-NEXT: mul a4, a4, t1
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v16, (a3)
+; RV32-NEXT: addi a5, a5, 3
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle16.v v8, (a6)
+; RV32-NEXT: vmv.s.x v7, a7
; RV32-NEXT: addi t0, t0, 12
-; RV32-NEXT: vmv.s.x v0, a7
-; RV32-NEXT: vmv.s.x v7, t0
-; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v4, (a1)
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v24, v16, 16
+; RV32-NEXT: vmv.s.x v0, a5
+; RV32-NEXT: vmv.s.x v3, t0
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v16, v24, v16, v0
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v24, v16, v8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 60
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: li a4, 52
+; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vle32.v v8, (a3)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 84
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 68
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v20, v24, v16, v0
+; RV32-NEXT: vmerge.vvm v8, v16, v8, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 40
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 76
+; RV32-NEXT: li a3, 60
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 76
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v24, v8, v16, v0
+; RV32-NEXT: vmerge.vvm v24, v16, v8, v0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v8, v24, v4
; RV32-NEXT: csrr a1, vlenb
@@ -279,45 +285,44 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: slli a1, a1, 10
; RV32-NEXT: addi a3, a3, 48
; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: vle16.v v14, (a4)
-; RV32-NEXT: vmv.s.x v12, a3
+; RV32-NEXT: vle16.v v4, (a4)
+; RV32-NEXT: vmv.s.x v7, a3
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 84
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vmv4r.v v8, v24
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 60
+; RV32-NEXT: li a3, 68
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 24
+; RV32-NEXT: li a3, 28
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v12
+; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 68
+; RV32-NEXT: li a3, 76
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v24, v24, v16, v0
+; RV32-NEXT: vmerge.vvm v24, v16, v24, v0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v16, v24, v14
+; RV32-NEXT: vrgatherei16.vv v8, v24, v4
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, 3
; RV32-NEXT: lui a3, 786624
; RV32-NEXT: lui a4, 12
@@ -330,20 +335,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a4, a4, 12
; RV32-NEXT: addi a5, a5, 768
; RV32-NEXT: addi a7, a7, -1024
-; RV32-NEXT: vmv.s.x v1, a6
-; RV32-NEXT: vmv.s.x v12, t0
+; RV32-NEXT: vmv.s.x v7, a6
+; RV32-NEXT: vmv.s.x v1, t0
; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: vmv.s.x v3, a3
-; RV32-NEXT: vmv.s.x v2, a4
-; RV32-NEXT: vmv.s.x v13, a5
-; RV32-NEXT: vmv.s.x v14, a7
+; RV32-NEXT: vmv.s.x v8, a3
+; RV32-NEXT: vmv.s.x v6, a4
+; RV32-NEXT: vmv.s.x v3, a5
+; RV32-NEXT: vmv.s.x v2, a7
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 60
+; RV32-NEXT: li a3, 68
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vmv4r.v v8, v16
+; RV32-NEXT: vmv4r.v v24, v16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 84
; RV32-NEXT: mul a1, a1, a3
@@ -351,103 +356,108 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v20, v8, v16, v0
+; RV32-NEXT: vmerge.vvm v12, v24, v16, v0
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v3
+; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 68
+; RV32-NEXT: li a3, 60
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 76
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v24, v16, v24, v0
+; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v2
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v6
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 84
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v24, v8, v24, v0
+; RV32-NEXT: vmerge.vvm v16, v24, v16, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 12
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v13
+; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 76
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v24, v16, v24, v0
+; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 84
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 68
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v4, v8, v24, v0
+; RV32-NEXT: vmerge.vvm v20, v24, v16, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 28
+; RV32-NEXT: li a3, 24
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v14
+; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v2
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 76
+; RV32-NEXT: li a3, 60
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 68
+; RV32-NEXT: li a3, 76
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v16, v24, v16, v0
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 76
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv1r.v v0, v12
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v1
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 84
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 68
; RV32-NEXT: mul a1, a1, a2
@@ -485,17 +495,17 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a2, a2, %lo(.LCPI8_3)
; RV32-NEXT: addi a1, a1, 5
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vle16.v v28, (a2)
+; RV32-NEXT: vle16.v v24, (a2)
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v20, a1
+; RV32-NEXT: vmv.v.x v25, a1
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
+; RV32-NEXT: li a2, 28
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v8, v12, v20
+; RV32-NEXT: vrgatherei16.vv v8, v12, v25
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 44
; RV32-NEXT: mul a1, a1, a2
@@ -505,118 +515,109 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 52
+; RV32-NEXT: li a2, 60
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v24, v12, v28
+; RV32-NEXT: vrgatherei16.vv v12, v8, v24
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v24, v16
+; RV32-NEXT: vmv.v.v v12, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 52
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
; RV32-NEXT: lui a1, %hi(.LCPI8_4)
; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4)
; RV32-NEXT: lui a2, %hi(.LCPI8_5)
; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v12, (a1)
+; RV32-NEXT: vle16.v v26, (a1)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vle16.v v28, (a2)
+; RV32-NEXT: vle16.v v24, (a2)
; RV32-NEXT: lui a1, %hi(.LCPI8_6)
; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6)
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle16.v v30, (a1)
+; RV32-NEXT: vle16.v v2, (a1)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v16, v0, v12
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vrgatherei16.vv v16, v8, v26
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 12
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v12, v20, v28
+; RV32-NEXT: vrgatherei16.vv v12, v8, v24
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
; RV32-NEXT: vmv.v.v v12, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v16, v0, v30
+; RV32-NEXT: vrgatherei16.vv v24, v16, v2
; RV32-NEXT: lui a1, %hi(.LCPI8_7)
; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7)
; RV32-NEXT: lui a2, %hi(.LCPI8_8)
; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vle16.v v20, (a1)
+; RV32-NEXT: vle16.v v16, (a1)
; RV32-NEXT: lui a1, %hi(.LCPI8_9)
; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v8, (a2)
+; RV32-NEXT: vle16.v v18, (a2)
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vle16.v v10, (a1)
+; RV32-NEXT: vle16.v v17, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 28
+; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vrgatherei16.vv v28, v0, v20
+; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vrgatherei16.vv v20, v28, v16
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v28, v16
+; RV32-NEXT: vmv.v.v v20, v24
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 76
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v16, v0, v8
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 60
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vrgatherei16.vv v0, v24, v18
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 68
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v16, v4, v10
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 60
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vrgatherei16.vv v28, v24, v17
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v16, v0
+; RV32-NEXT: vmv.v.v v28, v0
; RV32-NEXT: addi a1, a0, 320
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v16, (a1)
-; RV32-NEXT: addi a1, a0, 256
; RV32-NEXT: vse32.v v28, (a1)
+; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vse32.v v20, (a1)
; RV32-NEXT: addi a1, a0, 192
; RV32-NEXT: vse32.v v12, (a1)
; RV32-NEXT: addi a1, a0, 128
-; RV32-NEXT: vse32.v v24, (a1)
-; RV32-NEXT: addi a1, a0, 64
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 52
; RV32-NEXT: mul a2, a2, a3
@@ -624,6 +625,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
+; RV32-NEXT: addi a1, a0, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 60
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vse32.v v8, (a1)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 84
; RV32-NEXT: mul a1, a1, a2
@@ -645,162 +654,171 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 88
+; RV64-NEXT: li a3, 96
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb
-; RV64-NEXT: addi a3, a1, 128
-; RV64-NEXT: addi a6, a1, 256
-; RV64-NEXT: li a4, 128
-; RV64-NEXT: lui a2, 1
-; RV64-NEXT: lui a5, %hi(.LCPI8_0)
-; RV64-NEXT: addi a5, a5, %lo(.LCPI8_0)
+; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb
+; RV64-NEXT: addi a2, a1, 256
+; RV64-NEXT: li a3, 128
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vmv.v.i v16, 6
+; RV64-NEXT: lui a4, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v8, (a6)
-; RV64-NEXT: lui a6, 16
-; RV64-NEXT: addi a6, a6, 7
+; RV64-NEXT: vle64.v v8, (a2)
+; RV64-NEXT: vmv.s.x v0, a3
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: addi a4, a4, 7
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v17, a6
-; RV64-NEXT: addi a6, a2, 65
+; RV64-NEXT: vmv.v.x v17, a4
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vrgather.vi v24, v8, 4
-; RV64-NEXT: vrgather.vi v20, v8, 5
-; RV64-NEXT: csrr a7, vlenb
-; RV64-NEXT: li t0, 68
-; RV64-NEXT: mul a7, a7, t0
-; RV64-NEXT: add a7, sp, a7
-; RV64-NEXT: addi a7, a7, 16
-; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: vrgatherei16.vv v20, v8, v16
-; RV64-NEXT: csrr a7, vlenb
-; RV64-NEXT: li t0, 84
-; RV64-NEXT: mul a7, a7, t0
-; RV64-NEXT: add a7, sp, a7
-; RV64-NEXT: addi a7, a7, 16
-; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: vrgatherei16.vv v20, v8, v17
-; RV64-NEXT: csrr a7, vlenb
-; RV64-NEXT: li t0, 72
-; RV64-NEXT: mul a7, a7, t0
-; RV64-NEXT: add a7, sp, a7
-; RV64-NEXT: addi a7, a7, 16
-; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: vrgather.vi v20, v8, 4
+; RV64-NEXT: vrgather.vi v24, v8, 5
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 92
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs4r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vrgatherei16.vv v24, v8, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 80
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs4r.v v24, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vrgatherei16.vv v24, v8, v17
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 72
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs4r.v v24, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vrgather.vi v16, v8, 2
-; RV64-NEXT: csrr a7, vlenb
-; RV64-NEXT: slli a7, a7, 6
-; RV64-NEXT: add a7, sp, a7
-; RV64-NEXT: addi a7, a7, 16
-; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 88
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs4r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vrgather.vi v16, v8, 3
-; RV64-NEXT: csrr a7, vlenb
-; RV64-NEXT: li t0, 56
-; RV64-NEXT: mul a7, a7, t0
-; RV64-NEXT: add a7, sp, a7
-; RV64-NEXT: addi a7, a7, 16
-; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 84
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs4r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v8, 8
-; RV64-NEXT: csrr a7, vlenb
-; RV64-NEXT: li t0, 48
-; RV64-NEXT: mul a7, a7, t0
-; RV64-NEXT: add a7, sp, a7
-; RV64-NEXT: addi a7, a7, 16
-; RV64-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill
-; RV64-NEXT: vmv.s.x v21, a4
+; RV64-NEXT: vslidedown.vi v8, v8, 8
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 6
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT: vrgather.vi v20, v8, 2, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 76
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: addi a3, a1, 128
+; RV64-NEXT: lui a2, 1
+; RV64-NEXT: lui a4, %hi(.LCPI8_0)
+; RV64-NEXT: addi a4, a4, %lo(.LCPI8_0)
+; RV64-NEXT: addi a5, a2, 65
+; RV64-NEXT: vmv.s.x v0, a5
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vle16.v v8, (a4)
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: li a5, 24
+; RV64-NEXT: mul a4, a4, a5
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill
; RV64-NEXT: vle64.v v8, (a1)
-; RV64-NEXT: vle64.v v0, (a3)
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 40
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vle16.v v2, (a5)
-; RV64-NEXT: vmv.s.x v20, a6
-; RV64-NEXT: vmv1r.v v0, v21
-; RV64-NEXT: vmv1r.v v7, v21
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v24, v16, 2, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 60
-; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: li a4, 56
+; RV64-NEXT: mul a1, a1, a4
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v20
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vle64.v v16, (a3)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 40
+; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
-; RV64-NEXT: vmv8r.v v16, v8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 76
+; RV64-NEXT: li a3, 24
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vrgatherei16.vv v8, v24, v2
+; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vrgatherei16.vv v0, v24, v16
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 5
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill
; RV64-NEXT: lui a1, 2
; RV64-NEXT: lui a3, %hi(.LCPI8_1)
; RV64-NEXT: addi a3, a3, %lo(.LCPI8_1)
; RV64-NEXT: addi a1, a1, 130
-; RV64-NEXT: vle16.v v8, (a3)
+; RV64-NEXT: vle16.v v16, (a3)
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: slli a3, a3, 4
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 16
-; RV64-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill
-; RV64-NEXT: vmv.s.x v2, a1
-; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs1r.v v7, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs2r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: vmv.s.x v24, a1
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 68
+; RV64-NEXT: li a3, 40
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 48
+; RV64-NEXT: li a3, 92
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v24, v8, 3, v0.t
+; RV64-NEXT: vrgather.vi v28, v16, 3, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 68
+; RV64-NEXT: li a3, 92
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v2
+; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 40
+; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vmerge.vvm v24, v8, v16, v0
+; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vrgatherei16.vv v0, v24, v16
+; RV64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vrgatherei16.vv v0, v24, v8
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a3, 24
; RV64-NEXT: mul a1, a1, a3
@@ -814,95 +832,93 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vmv.s.x v0, a1
; RV64-NEXT: vmv.s.x v2, a3
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 76
+; RV64-NEXT: li a3, 56
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmerge.vvm v24, v8, v16, v0
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmerge.vvm v24, v16, v8, v0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl1r.v v7, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv1r.v v0, v7
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 84
+; RV64-NEXT: li a3, 40
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv1r.v v7, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 48
+; RV64-NEXT: li a3, 80
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv4r.v v8, v24
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v24, v16, 4, v0.t
+; RV64-NEXT: vrgather.vi v12, v24, 4, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 84
+; RV64-NEXT: li a3, 80
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
; RV64-NEXT: vmv1r.v v0, v2
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 76
+; RV64-NEXT: li a3, 56
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vmerge.vvm v24, v8, v16, v0
+; RV64-NEXT: vmerge.vvm v24, v16, v24, v0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv8r.v v16, v8
; RV64-NEXT: vmv1r.v v0, v7
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a3, 72
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 48
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv4r.v v8, v24
+; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgather.vi v12, v24, 5, v0.t
+; RV64-NEXT: vrgather.vi v4, v8, 5, v0.t
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a3, 72
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
; RV64-NEXT: lui a1, 96
; RV64-NEXT: li a3, 192
-; RV64-NEXT: vmv.s.x v3, a3
+; RV64-NEXT: vmv.s.x v1, a3
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v12, a1
-; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: vmv.v.x v3, a1
+; RV64-NEXT: vmv1r.v v0, v1
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: li a3, 88
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgatherei16.vv v24, v8, v12, v0.t
+; RV64-NEXT: vrgatherei16.vv v4, v8, v3, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: li a3, 88
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
; RV64-NEXT: lui a1, %hi(.LCPI8_2)
; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2)
; RV64-NEXT: li a3, 1040
@@ -910,34 +926,48 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a4, a4, 1
; RV64-NEXT: vmv.s.x v0, a3
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v12, a4
+; RV64-NEXT: vmv.v.x v24, a4
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle16.v v6, (a1)
-; RV64-NEXT: vmv8r.v v24, v16
+; RV64-NEXT: vle16.v v2, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 76
+; RV64-NEXT: li a3, 56
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmerge.vvm v16, v24, v16, v0
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 56
+; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 40
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a3, 84
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgatherei16.vv v16, v8, v12, v0.t
+; RV64-NEXT: vrgatherei16.vv v4, v8, v24, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a3, 56
+; RV64-NEXT: li a3, 84
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill
; RV64-NEXT: addi a1, a2, -2016
; RV64-NEXT: vmv.s.x v0, a1
; RV64-NEXT: csrr a1, vlenb
@@ -946,82 +976,91 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v16, v8, v6
+; RV64-NEXT: vrgatherei16.vv v24, v8, v2
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 76
+; RV64-NEXT: li a2, 48
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmerge.vvm v8, v24, v8, v0
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 76
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: slli a1, a1, 6
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: lui a1, %hi(.LCPI8_3)
; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3)
-; RV64-NEXT: vle16.v v24, (a1)
+; RV64-NEXT: vle16.v v8, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: li a2, 76
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 60
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: slli a1, a1, 5
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v8, v0
+; RV64-NEXT: vmv.v.v v12, v24
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 60
+; RV64-NEXT: li a2, 76
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 68
+; RV64-NEXT: li a2, 92
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 24
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv.v.v v0, v8
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.v v12, v24
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 84
+; RV64-NEXT: li a2, 92
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v8, v16
+; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 84
+; RV64-NEXT: li a2, 80
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; RV64-NEXT: vmv.v.v v16, v24
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v16, v8, v24
+; RV64-NEXT: vrgatherei16.vv v0, v24, v8
; RV64-NEXT: lui a1, %hi(.LCPI8_4)
; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4)
; RV64-NEXT: vle16.v v8, (a1)
; RV64-NEXT: lui a1, %hi(.LCPI8_5)
; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5)
-; RV64-NEXT: vle16.v v6, (a1)
+; RV64-NEXT: vle16.v v10, (a1)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 80
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs2r.v v10, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 72
; RV64-NEXT: mul a1, a1, a2
@@ -1029,60 +1068,70 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v12, v16
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.v v12, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 40
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v24, v16, v8
+; RV64-NEXT: vrgatherei16.vv v24, v0, v8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: li a2, 88
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
; RV64-NEXT: vmv.v.v v8, v24
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 76
+; RV64-NEXT: slli a1, a1, 6
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 80
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl2r.v v20, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v24, v16, v6
+; RV64-NEXT: vrgatherei16.vv v24, v0, v20
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 56
+; RV64-NEXT: li a2, 84
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v16, v24
+; RV64-NEXT: vmv.v.v v28, v24
; RV64-NEXT: addi a1, a0, 256
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vse64.v v8, (a1)
; RV64-NEXT: addi a1, a0, 320
-; RV64-NEXT: vse64.v v16, (a1)
+; RV64-NEXT: vse64.v v28, (a1)
; RV64-NEXT: addi a1, a0, 192
; RV64-NEXT: vse64.v v12, (a1)
; RV64-NEXT: addi a1, a0, 128
+; RV64-NEXT: vse64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 64
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 84
+; RV64-NEXT: li a3, 92
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a1)
-; RV64-NEXT: addi a1, a0, 64
-; RV64-NEXT: vse64.v v0, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 60
+; RV64-NEXT: li a2, 76
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a1, 88
+; RV64-NEXT: li a1, 96
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 2b279389253b01a..1cab7370e55ba20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1344,17 +1344,17 @@ define float @vreduce_fmin_v128f32(ptr %x) {
; CHECK-LABEL: vreduce_fmin_v128f32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: addi a2, a0, 384
+; CHECK-NEXT: addi a2, a0, 256
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a2)
-; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: vle32.v v16, (a0)
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vle32.v v24, (a0)
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vfmin.vv v8, v24, v8
-; CHECK-NEXT: vfmin.vv v16, v16, v0
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vle32.v v0, (a0)
+; CHECK-NEXT: vfmin.vv v24, v0, v24
; CHECK-NEXT: vfmin.vv v8, v16, v8
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: vfredmin.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
@@ -1591,17 +1591,17 @@ define float @vreduce_fmax_v128f32(ptr %x) {
; CHECK-LABEL: vreduce_fmax_v128f32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: addi a2, a0, 384
+; CHECK-NEXT: addi a2, a0, 256
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a2)
-; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: vle32.v v16, (a0)
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vle32.v v24, (a0)
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vfmax.vv v8, v24, v8
-; CHECK-NEXT: vfmax.vv v16, v16, v0
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vle32.v v0, (a0)
+; CHECK-NEXT: vfmax.vv v24, v0, v24
; CHECK-NEXT: vfmax.vv v8, v16, v8
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: vfredmax.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
@@ -1997,57 +1997,64 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: addi a2, a0, 128
+; CHECK-NEXT: addi a2, a0, 384
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a2)
-; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vle32.v v16, (a2)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vle32.v v24, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v0
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v16, v8
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v24, v8, v24
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmin.vv v16, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
@@ -2077,17 +2084,17 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) {
; CHECK-LABEL: vreduce_fminimum_v128f32_nonans:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: addi a2, a0, 384
+; CHECK-NEXT: addi a2, a0, 256
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a2)
-; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: vle32.v v16, (a0)
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vle32.v v24, (a0)
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vfmin.vv v8, v24, v8
-; CHECK-NEXT: vfmin.vv v16, v16, v0
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vle32.v v0, (a0)
+; CHECK-NEXT: vfmin.vv v24, v0, v24
; CHECK-NEXT: vfmin.vv v8, v16, v8
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: vfredmin.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
@@ -2314,57 +2321,64 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 384
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v0
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v16, v8
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v24, v8, v24
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmin.vv v16, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
@@ -2395,15 +2409,15 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: vle64.v v0, (a1)
-; CHECK-NEXT: vfmin.vv v16, v24, v16
-; CHECK-NEXT: vfmin.vv v8, v8, v0
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vle64.v v0, (a0)
+; CHECK-NEXT: vfmin.vv v24, v0, v24
; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: vfredmin.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
@@ -2711,57 +2725,64 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: addi a2, a0, 128
+; CHECK-NEXT: addi a2, a0, 384
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a2)
-; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vle32.v v16, (a2)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vle32.v v24, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v0
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v16, v8
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v24, v8, v24
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmax.vv v16, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
@@ -2791,17 +2812,17 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) {
; CHECK-LABEL: vreduce_fmaximum_v128f32_nonans:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: addi a2, a0, 384
+; CHECK-NEXT: addi a2, a0, 256
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a2)
-; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: vle32.v v16, (a0)
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vle32.v v24, (a0)
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vfmax.vv v8, v24, v8
-; CHECK-NEXT: vfmax.vv v16, v16, v0
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vle32.v v0, (a0)
+; CHECK-NEXT: vfmax.vv v24, v0, v24
; CHECK-NEXT: vfmax.vv v8, v16, v8
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: vfredmax.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
@@ -3028,57 +3049,64 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 384
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v0
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v16, v8
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v24, v8, v24
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmax.vv v16, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
@@ -3109,15 +3137,15 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 256
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: vle64.v v0, (a1)
-; CHECK-NEXT: vfmax.vv v16, v24, v16
-; CHECK-NEXT: vfmax.vv v8, v8, v0
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vle64.v v0, (a0)
+; CHECK-NEXT: vfmax.vv v24, v0, v24
; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: vfredmax.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
index 707d1202aca0f98..84dda70d042a46e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -1471,14 +1471,14 @@ declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
define i64 @vreduce_add_v64i64(ptr %x) nounwind {
; RV32-LABEL: vreduce_add_v64i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle64.v v24, (a1)
-; RV32-NEXT: addi a1, a0, 128
-; RV32-NEXT: vle64.v v0, (a1)
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 256
-; RV32-NEXT: vle64.v v16, (a0)
+; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
+; RV32-NEXT: addi a0, a0, 128
+; RV32-NEXT: vle64.v v24, (a1)
+; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vadd.vv v24, v0, v24
; RV32-NEXT: vmv.s.x v7, zero
; RV32-NEXT: li a1, 32
@@ -1495,15 +1495,15 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vadd.vv v16, v24, v16
-; RV64-NEXT: vadd.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vadd.vv v24, v0, v24
; RV64-NEXT: vadd.vv v8, v8, v16
+; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vmv.s.x v16, zero
; RV64-NEXT: vredsum.vs v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
@@ -1550,15 +1550,15 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: li a2, 32
; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: vle32.v v16, (a1)
+; RV64-NEXT: vle32.v v8, (a1)
+; RV64-NEXT: vle32.v v16, (a0)
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v24, v8, 16
+; RV64-NEXT: vslidedown.vi v24, v16, 16
; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV64-NEXT: vslidedown.vi v0, v16, 16
+; RV64-NEXT: vslidedown.vi v0, v8, 16
; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT: vwadd.vv v24, v8, v16
+; RV64-NEXT: vwadd.vv v24, v16, v8
; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vwadd.vv v8, v16, v0
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1616,15 +1616,15 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) {
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: li a2, 32
; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: vle32.v v16, (a1)
+; RV64-NEXT: vle32.v v8, (a1)
+; RV64-NEXT: vle32.v v16, (a0)
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v24, v8, 16
+; RV64-NEXT: vslidedown.vi v24, v16, 16
; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV64-NEXT: vslidedown.vi v0, v16, 16
+; RV64-NEXT: vslidedown.vi v0, v8, 16
; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT: vwaddu.vv v24, v8, v16
+; RV64-NEXT: vwaddu.vv v24, v16, v8
; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vwaddu.vv v8, v16, v0
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -2201,16 +2201,16 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: li a1, 32
-; RV32-NEXT: vand.vv v16, v0, v16
-; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vle64.v v0, (a0)
+; RV32-NEXT: vand.vv v24, v0, v24
; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vredand.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -2222,15 +2222,15 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vand.vv v16, v24, v16
-; RV64-NEXT: vand.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vand.vv v24, v0, v24
; RV64-NEXT: vand.vv v8, v8, v16
+; RV64-NEXT: vand.vv v8, v8, v24
; RV64-NEXT: vredand.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
@@ -2793,16 +2793,16 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: li a1, 32
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vle64.v v0, (a0)
+; RV32-NEXT: vor.vv v24, v0, v24
; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vor.vv v8, v8, v24
; RV32-NEXT: vredor.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -2814,15 +2814,15 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vor.vv v16, v24, v16
-; RV64-NEXT: vor.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vor.vv v24, v0, v24
; RV64-NEXT: vor.vv v8, v8, v16
+; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vredor.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
@@ -3414,14 +3414,14 @@ declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>)
define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
; RV32-LABEL: vreduce_xor_v64i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle64.v v24, (a1)
-; RV32-NEXT: addi a1, a0, 128
-; RV32-NEXT: vle64.v v0, (a1)
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a0, a0, 256
-; RV32-NEXT: vle64.v v16, (a0)
+; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
+; RV32-NEXT: addi a0, a0, 128
+; RV32-NEXT: vle64.v v24, (a1)
+; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vxor.vv v24, v0, v24
; RV32-NEXT: vmv.s.x v7, zero
; RV32-NEXT: li a1, 32
@@ -3438,15 +3438,15 @@ define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vxor.vv v16, v24, v16
-; RV64-NEXT: vxor.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vxor.vv v24, v0, v24
; RV64-NEXT: vxor.vv v8, v8, v16
+; RV64-NEXT: vxor.vv v8, v8, v24
; RV64-NEXT: vmv.s.x v16, zero
; RV64-NEXT: vredxor.vs v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
@@ -4011,16 +4011,16 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: li a1, 32
-; RV32-NEXT: vmin.vv v16, v0, v16
-; RV32-NEXT: vmin.vv v8, v8, v24
+; RV32-NEXT: vle64.v v0, (a0)
+; RV32-NEXT: vmin.vv v24, v0, v24
; RV32-NEXT: vmin.vv v8, v8, v16
+; RV32-NEXT: vmin.vv v8, v8, v24
; RV32-NEXT: vredmin.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -4032,15 +4032,15 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vmin.vv v16, v24, v16
-; RV64-NEXT: vmin.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vmin.vv v24, v0, v24
; RV64-NEXT: vmin.vv v8, v8, v16
+; RV64-NEXT: vmin.vv v8, v8, v24
; RV64-NEXT: vredmin.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
@@ -4604,16 +4604,16 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: li a1, 32
-; RV32-NEXT: vmax.vv v16, v0, v16
-; RV32-NEXT: vmax.vv v8, v8, v24
+; RV32-NEXT: vle64.v v0, (a0)
+; RV32-NEXT: vmax.vv v24, v0, v24
; RV32-NEXT: vmax.vv v8, v8, v16
+; RV32-NEXT: vmax.vv v8, v8, v24
; RV32-NEXT: vredmax.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -4625,15 +4625,15 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vmax.vv v16, v24, v16
-; RV64-NEXT: vmax.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vmax.vv v24, v0, v24
; RV64-NEXT: vmax.vv v8, v8, v16
+; RV64-NEXT: vmax.vv v8, v8, v24
; RV64-NEXT: vredmax.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
@@ -5197,16 +5197,16 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: li a1, 32
-; RV32-NEXT: vminu.vv v16, v0, v16
-; RV32-NEXT: vminu.vv v8, v8, v24
+; RV32-NEXT: vle64.v v0, (a0)
+; RV32-NEXT: vminu.vv v24, v0, v24
; RV32-NEXT: vminu.vv v8, v8, v16
+; RV32-NEXT: vminu.vv v8, v8, v24
; RV32-NEXT: vredminu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -5218,15 +5218,15 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vminu.vv v16, v24, v16
-; RV64-NEXT: vminu.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vminu.vv v24, v0, v24
; RV64-NEXT: vminu.vv v8, v8, v16
+; RV64-NEXT: vminu.vv v8, v8, v24
; RV64-NEXT: vredminu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
@@ -5789,16 +5789,16 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vle64.v v0, (a0)
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: li a1, 32
-; RV32-NEXT: vmaxu.vv v16, v0, v16
-; RV32-NEXT: vmaxu.vv v8, v8, v24
+; RV32-NEXT: vle64.v v0, (a0)
+; RV32-NEXT: vmaxu.vv v24, v0, v24
; RV32-NEXT: vmaxu.vv v8, v8, v16
+; RV32-NEXT: vmaxu.vv v8, v8, v24
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -5810,15 +5810,15 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vmaxu.vv v16, v24, v16
-; RV64-NEXT: vmaxu.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vmaxu.vv v24, v0, v24
; RV64-NEXT: vmaxu.vv v8, v8, v16
+; RV64-NEXT: vmaxu.vv v8, v8, v24
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
@@ -6585,15 +6585,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: vle64.v v24, (a0)
-; RV32-NEXT: vle64.v v0, (a1)
-; RV32-NEXT: vmul.vv v16, v24, v16
-; RV32-NEXT: vmul.vv v8, v8, v0
+; RV32-NEXT: vle64.v v24, (a1)
+; RV32-NEXT: vle64.v v0, (a0)
+; RV32-NEXT: vmul.vv v24, v0, v24
; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
@@ -6612,15 +6612,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: vle64.v v0, (a1)
-; RV64-NEXT: vmul.vv v16, v24, v16
-; RV64-NEXT: vmul.vv v8, v8, v0
+; RV64-NEXT: vle64.v v24, (a1)
+; RV64-NEXT: vle64.v v0, (a0)
+; RV64-NEXT: vmul.vv v24, v0, v24
; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vmul.vv v8, v8, v24
; RV64-NEXT: vslidedown.vi v16, v8, 8
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
index 318f38839851c33..77c5d1b3bf64183 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll
@@ -149,18 +149,17 @@ define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32>
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-NEXT: vmv8r.v v16, v8
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vslidedown.vi v7, v0, 4
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vle32.v v24, (a0)
; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v0, v0, 4
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: vadd.vv v8, v16, v8
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vrsub.vi v24, v24, 0, v0.t
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 03d5762b4903efb..c026b8427578b9f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1073,19 +1073,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; ZVFH-NEXT: addi a1, a0, 128
; ZVFH-NEXT: li a3, 64
+; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; ZVFH-NEXT: vslidedown.vi v24, v0, 8
; ZVFH-NEXT: vsetvli zero, a3, e16, m8, ta, ma
; ZVFH-NEXT: vle16.v v16, (a1)
; ZVFH-NEXT: addi a1, sp, 16
; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT: mv a1, a2
; ZVFH-NEXT: vle16.v v16, (a0)
-; ZVFH-NEXT: mv a0, a2
-; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; ZVFH-NEXT: vslidedown.vi v24, v0, 8
; ZVFH-NEXT: bltu a2, a3, .LBB43_2
; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: li a0, 64
+; ZVFH-NEXT: li a1, 64
; ZVFH-NEXT: .LBB43_2:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v7, v8, v16, v0.t
; ZVFH-NEXT: addi a0, a2, -64
; ZVFH-NEXT: sltu a1, a2, a0
@@ -1152,14 +1152,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: addi a3, sp, 640
; ZVFHMIN32-NEXT: addi a4, sp, 384
; ZVFHMIN32-NEXT: addi a5, sp, 512
+; ZVFHMIN32-NEXT: addi a6, sp, 256
; ZVFHMIN32-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; ZVFHMIN32-NEXT: vle16.v v0, (a0)
-; ZVFHMIN32-NEXT: addi a0, sp, 256
; ZVFHMIN32-NEXT: vle16.v v24, (a1)
+; ZVFHMIN32-NEXT: vle16.v v0, (a0)
; ZVFHMIN32-NEXT: vse16.v v8, (a3)
; ZVFHMIN32-NEXT: vse16.v v0, (a4)
; ZVFHMIN32-NEXT: vse16.v v16, (a5)
-; ZVFHMIN32-NEXT: vse16.v v24, (a0)
+; ZVFHMIN32-NEXT: vse16.v v24, (a6)
; ZVFHMIN32-NEXT: lh a0, 704(sp)
; ZVFHMIN32-NEXT: lh a1, 448(sp)
; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
@@ -2286,14 +2286,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: addi a3, sp, 640
; ZVFHMIN64-NEXT: addi a4, sp, 384
; ZVFHMIN64-NEXT: addi a5, sp, 512
+; ZVFHMIN64-NEXT: addi a6, sp, 256
; ZVFHMIN64-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; ZVFHMIN64-NEXT: vle16.v v0, (a0)
-; ZVFHMIN64-NEXT: addi a0, sp, 256
; ZVFHMIN64-NEXT: vle16.v v24, (a1)
+; ZVFHMIN64-NEXT: vle16.v v0, (a0)
; ZVFHMIN64-NEXT: vse16.v v8, (a3)
; ZVFHMIN64-NEXT: vse16.v v0, (a4)
; ZVFHMIN64-NEXT: vse16.v v16, (a5)
-; ZVFHMIN64-NEXT: vse16.v v24, (a0)
+; ZVFHMIN64-NEXT: vse16.v v24, (a6)
; ZVFHMIN64-NEXT: lh a0, 704(sp)
; ZVFHMIN64-NEXT: lh a1, 448(sp)
; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
@@ -3953,20 +3953,20 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v24, v0, 2
+; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v0, 2
-; CHECK-NEXT: bltu a2, a1, .LBB87_2
+; CHECK-NEXT: bltu a2, a3, .LBB87_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB87_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v7, v8, v16, v0.t
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
@@ -3977,13 +3977,13 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vslideup.vi v7, v8, 2
+; CHECK-NEXT: vslideup.vi v7, v16, 2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index 69d6ffa9f300c07..56d94dd55696bd7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -599,31 +599,31 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: li a1, 128
-; CHECK-NEXT: addi a4, a0, 128
+; CHECK-NEXT: addi a4, a3, -128
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a2)
-; CHECK-NEXT: addi a2, a3, -128
-; CHECK-NEXT: vle8.v v8, (a4)
-; CHECK-NEXT: sltu a4, a3, a2
-; CHECK-NEXT: vle8.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: sltu a2, a3, a4
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a2, a2, a4
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: addi a4, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vmseq.vv v6, v16, v8, v0.t
+; CHECK-NEXT: vmseq.vv v7, v16, v8, v0.t
; CHECK-NEXT: bltu a3, a1, .LBB51_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: .LBB51_2:
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
@@ -634,7 +634,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t
; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: vmv1r.v v8, v6
+; CHECK-NEXT: vmv1r.v v8, v7
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
@@ -1263,19 +1263,19 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: li a3, 32
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v24, v0, 4
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v16, (a1)
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: vle32.v v16, (a0)
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v0, 4
; CHECK-NEXT: bltu a2, a3, .LBB99_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: li a1, 32
; CHECK-NEXT: .LBB99_2:
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vv v7, v8, v16, v0.t
; CHECK-NEXT: addi a0, a2, -32
; CHECK-NEXT: sltu a1, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 4b7f82f94f5e4d4..9e887dfa8dc06b8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -636,16 +636,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: add a5, a1, a5
; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v24, (a5), a2, v0.t
+; CHECK-RV32-NEXT: addi a3, a0, 128
+; CHECK-RV32-NEXT: addi a5, a0, 256
; CHECK-RV32-NEXT: vmv1r.v v0, v8
; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v8, (a1), a2, v0.t
-; CHECK-RV32-NEXT: addi a1, a0, 128
-; CHECK-RV32-NEXT: addi a2, a0, 256
; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV32-NEXT: vse64.v v8, (a0)
-; CHECK-RV32-NEXT: vse64.v v24, (a1)
+; CHECK-RV32-NEXT: vse64.v v24, (a3)
; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT: vse64.v v16, (a2)
+; CHECK-RV32-NEXT: vse64.v v16, (a5)
; CHECK-RV32-NEXT: ret
;
; CHECK-RV64-LABEL: strided_load_v33f64:
@@ -687,16 +687,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: add a5, a1, a5
; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v24, (a5), a2, v0.t
+; CHECK-RV64-NEXT: addi a4, a0, 128
+; CHECK-RV64-NEXT: addi a5, a0, 256
; CHECK-RV64-NEXT: vmv1r.v v0, v8
; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v8, (a1), a2, v0.t
-; CHECK-RV64-NEXT: addi a1, a0, 128
-; CHECK-RV64-NEXT: addi a2, a0, 256
; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV64-NEXT: vse64.v v8, (a0)
-; CHECK-RV64-NEXT: vse64.v v24, (a1)
+; CHECK-RV64-NEXT: vse64.v v24, (a4)
; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT: vse64.v v16, (a2)
+; CHECK-RV64-NEXT: vse64.v v16, (a5)
; CHECK-RV64-NEXT: ret
%v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
ret <33 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index a91dee1cb245f91..6444968a04a8379 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -245,64 +245,62 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vslidedown.vi v6, v0, 8
-; CHECK-NEXT: addi a2, a1, 512
-; CHECK-NEXT: addi a3, a1, 640
-; CHECK-NEXT: addi a4, a7, -64
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v26, v0, 4
+; CHECK-NEXT: addi a3, a1, 128
+; CHECK-NEXT: addi a2, a1, 640
+; CHECK-NEXT: addi a4, a7, -64
; CHECK-NEXT: vslidedown.vi v27, v6, 4
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a3)
-; CHECK-NEXT: sltu a3, a7, a4
+; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: sltu a2, a7, a4
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v27, 2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a4, a3, a4
-; CHECK-NEXT: addi a3, a4, -32
-; CHECK-NEXT: sltu a5, a4, a3
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a4, a2, a4
+; CHECK-NEXT: addi a2, a4, -32
+; CHECK-NEXT: sltu a5, a4, a2
; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a3, a5, a3
-; CHECK-NEXT: addi a5, a3, -16
-; CHECK-NEXT: sltu a6, a3, a5
+; CHECK-NEXT: and a5, a5, a2
+; CHECK-NEXT: addi a2, a5, -16
+; CHECK-NEXT: sltu a6, a5, a2
; CHECK-NEXT: addi a6, a6, -1
-; CHECK-NEXT: and a5, a6, a5
-; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma
+; CHECK-NEXT: and a2, a6, a2
+; CHECK-NEXT: addi a6, a1, 512
+; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 4
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a2)
-; CHECK-NEXT: addi a5, a1, 128
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: li a2, 16
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v26, v7, 4
-; CHECK-NEXT: bltu a3, a2, .LBB16_2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a6)
+; CHECK-NEXT: bltu a5, a2, .LBB16_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: li a5, 16
; CHECK-NEXT: .LBB16_2:
; CHECK-NEXT: vmv1r.v v0, v27
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a5)
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: li a6, 56
-; CHECK-NEXT: mul a5, a5, a6
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v27, v26, 2
-; CHECK-NEXT: li a5, 64
-; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT: vle64.v v16, (a3)
; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 6
+; CHECK-NEXT: li a6, 56
+; CHECK-NEXT: mul a3, a3, a6
; CHECK-NEXT: add a3, sp, a3
; CHECK-NEXT: addi a3, a3, 16
; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v27, v26, 2
+; CHECK-NEXT: li a3, 64
+; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma
+; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; CHECK-NEXT: mv a6, a7
-; CHECK-NEXT: bltu a7, a5, .LBB16_4
+; CHECK-NEXT: bltu a7, a3, .LBB16_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: li a6, 64
; CHECK-NEXT: .LBB16_4:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index fa82065f3b4131e..0b32818dba15fc1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -298,46 +298,36 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v7, v0, 2
+; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v0, 2
-; CHECK-NEXT: bltu a2, a1, .LBB26_2
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: bltu a2, a3, .LBB26_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB26_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfsgnj.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfsgnj.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -352,8 +342,8 @@ define <32 x double> @vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: vle64.v v0, (a0)
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB27_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index bde842dcc7600fb..a6c51ced93ddc07 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -849,35 +849,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a2, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a2)
-; CHECK-NEXT: addi a2, a0, 128
-; CHECK-NEXT: vle64.v v8, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v8, (a2)
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a4
+; CHECK-NEXT: addi a3, a2, 128
+; CHECK-NEXT: addi a5, a0, 128
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v7, v0, 2
-; CHECK-NEXT: bltu a4, a1, .LBB50_2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: mv a1, a4
+; CHECK-NEXT: vle64.v v8, (a3)
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v8, (a5)
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: bltu a4, a2, .LBB50_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB50_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
@@ -893,16 +893,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
; CHECK-NEXT: vmv.v.v v16, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
@@ -941,26 +941,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a2, 128
+; CHECK-NEXT: addi a3, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
-; CHECK-NEXT: addi a2, a0, 128
+; CHECK-NEXT: li a2, 16
; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v24, (a2)
+; CHECK-NEXT: mv a1, a4
+; CHECK-NEXT: vle64.v v24, (a3)
; CHECK-NEXT: vle64.v v0, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a4
-; CHECK-NEXT: bltu a4, a1, .LBB51_2
+; CHECK-NEXT: bltu a4, a2, .LBB51_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB51_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v0, v8, v16
; CHECK-NEXT: addi a0, a4, -16
; CHECK-NEXT: sltu a1, a4, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index b37c47a32ba21e4..16043d33fb4df2f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -390,46 +390,36 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v7, v0, 2
+; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v0, 2
-; CHECK-NEXT: bltu a2, a1, .LBB26_2
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: bltu a2, a3, .LBB26_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB26_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v24, v0.t
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -444,8 +434,8 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: vle64.v v0, (a0)
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB27_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 261523e8ace500f..5a4a85ca32a707f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -390,46 +390,36 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v7, v0, 2
+; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v0, 2
-; CHECK-NEXT: bltu a2, a1, .LBB26_2
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: bltu a2, a3, .LBB26_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB26_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v24, v0.t
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vfmin.vv v16, v24, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -444,8 +434,8 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: vle64.v v0, (a0)
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: li a1, 16
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB27_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index a5d9b3439e29bd1..eb4ce757a83854f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -621,35 +621,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a2, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a2)
-; CHECK-NEXT: addi a2, a0, 128
-; CHECK-NEXT: vle64.v v8, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v8, (a2)
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a4
+; CHECK-NEXT: addi a3, a2, 128
+; CHECK-NEXT: addi a5, a0, 128
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v7, v0, 2
-; CHECK-NEXT: bltu a4, a1, .LBB50_2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: mv a1, a4
+; CHECK-NEXT: vle64.v v8, (a3)
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v8, (a5)
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: bltu a4, a2, .LBB50_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB50_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
@@ -665,16 +665,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
; CHECK-NEXT: vmv.v.v v16, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
@@ -713,26 +713,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a2, 128
+; CHECK-NEXT: addi a3, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
-; CHECK-NEXT: addi a2, a0, 128
+; CHECK-NEXT: li a2, 16
; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle64.v v24, (a2)
+; CHECK-NEXT: mv a1, a4
+; CHECK-NEXT: vle64.v v24, (a3)
; CHECK-NEXT: vle64.v v0, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a4
-; CHECK-NEXT: bltu a4, a1, .LBB51_2
+; CHECK-NEXT: bltu a4, a2, .LBB51_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB51_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v0, v8, v16
; CHECK-NEXT: addi a0, a4, -16
; CHECK-NEXT: sltu a1, a4, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll
index f2960b9600acaa6..fc893ae00585d9f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll
@@ -99,8 +99,8 @@ define <64 x float> @vfwadd_v64f16(ptr %x, ptr %y) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll
index 325302cb2bb8e09..350f5efa46a66f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll
@@ -99,8 +99,8 @@ define <64 x float> @vfwmul_v64f16(ptr %x, ptr %y) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll
index 599a3f90994034b..7c901494c033b72 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll
@@ -99,8 +99,8 @@ define <64 x float> @vfwsub_v64f16(ptr %x, ptr %y) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
index 6394542479d1b07..bb364b1d4bcf525 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
@@ -1360,15 +1360,15 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vmv8r.v v16, v8
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB83_2
+; CHECK-NEXT: bltu a2, a3, .LBB83_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB83_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index d691dcd5c54b6f7..1dff4a3feeaab5f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1778,16 +1778,16 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
; RV64-NEXT: addi a1, sp, 16
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: addi a1, a0, 128
+; RV64-NEXT: li a3, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: mv a1, a2
; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: li a1, 16
-; RV64-NEXT: mv a0, a2
-; RV64-NEXT: bltu a2, a1, .LBB83_2
+; RV64-NEXT: bltu a2, a3, .LBB83_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 16
; RV64-NEXT: .LBB83_2:
-; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
; RV64-NEXT: addi a0, a2, -16
; RV64-NEXT: sltu a1, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll
index dc83edba5ae8cc9..c5506e175ce007b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll
@@ -8,104 +8,48 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a4, 48
-; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a4, 40
-; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a4, a3, 128
-; CHECK-NEXT: addi a5, a3, 384
+; CHECK-NEXT: addi a4, a3, 256
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a5)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a5, 24
-; CHECK-NEXT: mul a2, a2, a5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a2, a1, 128
-; CHECK-NEXT: vle8.v v8, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a3, 256
-; CHECK-NEXT: vle8.v v8, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v8, (a2)
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle8.v v24, (a4)
+; CHECK-NEXT: addi a2, a3, 384
+; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: addi a1, a1, 128
+; CHECK-NEXT: vadd.vv v8, v0, v24
+; CHECK-NEXT: addi a4, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vle8.v v0, (a2)
+; CHECK-NEXT: vle8.v v24, (a1)
+; CHECK-NEXT: vadd.vv v24, v24, v0
+; CHECK-NEXT: addi a1, a3, 128
+; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vadd.vv v16, v16, v0
; CHECK-NEXT: vle8.v v0, (a3)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vadd.vv v8, v8, v16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vadd.vv v16, v16, v8
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vadd.vv v24, v8, v24
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 40
-; CHECK-NEXT: mul a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vv v0, v8, v0
; CHECK-NEXT: vse8.v v0, (a0)
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vse8.v v16, (a1)
+; CHECK-NEXT: vse8.v v24, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vse8.v v8, (a1)
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: vse8.v v24, (a0)
+; CHECK-NEXT: vse8.v v16, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 48
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -121,10 +65,10 @@ define <512 x i8> @vadd_v512i8_zvl256(<512 x i8> %a, <512 x i8> %b) #1 {
; CHECK-NEXT: addi a1, a0, 256
; CHECK-NEXT: li a2, 256
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v24, (a0)
-; CHECK-NEXT: vle8.v v0, (a1)
-; CHECK-NEXT: vadd.vv v8, v8, v24
-; CHECK-NEXT: vadd.vv v16, v16, v0
+; CHECK-NEXT: vle8.v v24, (a1)
+; CHECK-NEXT: vle8.v v0, (a0)
+; CHECK-NEXT: vadd.vv v8, v8, v0
+; CHECK-NEXT: vadd.vv v16, v16, v24
; CHECK-NEXT: ret
%c = add <512 x i8> %a, %b
ret <512 x i8> %c
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index 05254e60b65b74a..6e173b25adce97b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -167,15 +167,15 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
; CHECK-NEXT: vmv1r.v v6, v8
; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a4, a1, 128
+; CHECK-NEXT: addi a4, a3, -128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v24, (a0)
-; CHECK-NEXT: addi a0, a3, -128
-; CHECK-NEXT: vle8.v v8, (a4)
-; CHECK-NEXT: sltu a4, a3, a0
+; CHECK-NEXT: sltu a0, a3, a4
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: vle8.v v16, (a1)
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a0, a4, a0
+; CHECK-NEXT: addi a1, a1, 128
+; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
@@ -206,40 +206,30 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v16, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, a1, 128
+; CHECK-NEXT: vle8.v v16, (a1)
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a1, 128
; CHECK-NEXT: vle8.v v24, (a0)
; CHECK-NEXT: vle8.v v16, (a1)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -249,8 +239,7 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -418,16 +407,16 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB25_2
+; CHECK-NEXT: bltu a2, a3, .LBB25_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: .LBB25_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
@@ -456,39 +445,50 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v24, v0, 2
+; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -497,8 +497,7 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -621,13 +620,13 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: vle32.v v24, (a0)
-; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a3, .LBB35_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: li a1, 32
; CHECK-NEXT: .LBB35_2:
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
; CHECK-NEXT: addi a0, a2, -32
; CHECK-NEXT: sltu a1, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
index 50184796b38f534..ab44b864d68fa99 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
@@ -257,8 +257,8 @@ define <128 x i16> @vwadd_v128i16(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
@@ -302,8 +302,8 @@ define <64 x i32> @vwadd_v64i32(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
index 98f246b8741dcc8..01d38fe5daeab4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
@@ -257,8 +257,8 @@ define <128 x i16> @vwaddu_v128i16(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
@@ -302,8 +302,8 @@ define <64 x i32> @vwaddu_v64i32(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
index eb7be14abe431d8..e92e3d554e494e5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
@@ -283,8 +283,8 @@ define <128 x i16> @vwmul_v128i16(ptr %x, ptr %y) {
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
@@ -332,8 +332,8 @@ define <64 x i32> @vwmul_v64i32(ptr %x, ptr %y) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
index 8626b25a9d323cb..a4b0623b0637361 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
@@ -275,8 +275,8 @@ define <128 x i16> @vwmulsu_v128i16(ptr %x, ptr %y) {
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
@@ -324,8 +324,8 @@ define <64 x i32> @vwmulsu_v64i32(ptr %x, ptr %y) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
index 007b561a2247ada..1d5b255d5109892 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -259,8 +259,8 @@ define <128 x i16> @vwmulu_v128i16(ptr %x, ptr %y) {
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
@@ -308,8 +308,8 @@ define <64 x i32> @vwmulu_v64i32(ptr %x, ptr %y) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
index 7a925165d981639..0654fbbd05f8c0b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
@@ -257,8 +257,8 @@ define <128 x i16> @vwsub_v128i16(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
@@ -302,8 +302,8 @@ define <64 x i32> @vwsub_v64i32(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
index 4c08a8c15a388e0..57f95877b1e3642 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
@@ -257,8 +257,8 @@ define <128 x i16> @vwsubu_v128i16(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vle8.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
@@ -302,8 +302,8 @@ define <64 x i32> @vwsubu_v64i32(ptr %x, ptr %y) nounwind {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vle16.v v0, (a1)
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vslidedown.vx v8, v0, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index d56e46f7db3ab3a..d981e5b3ce6ef16 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -1615,26 +1615,24 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a3, a1, 3
-; CHECK-NEXT: srli a4, a1, 3
-; CHECK-NEXT: vslidedown.vx v6, v0, a4
+; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: sub a4, a2, a1
-; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vl8re64.v v8, (a3)
+; CHECK-NEXT: vslidedown.vx v6, v0, a3
; CHECK-NEXT: sltu a3, a2, a4
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: slli a4, a1, 3
+; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t
+; CHECK-NEXT: vl8re64.v v8, (a4)
; CHECK-NEXT: vmv1r.v v0, v26
; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
; CHECK-NEXT: csrr a3, vlenb
@@ -1644,11 +1642,9 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t
-; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: vmv1r.v v0, v26
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
@@ -1707,12 +1703,11 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1727,15 +1722,10 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: vfmax.vv v8, v16, v8
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -1744,27 +1734,21 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v24
+; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v24, v16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 81e4a548f560e23..d2a9916b26e2296 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -1615,26 +1615,24 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a3, a1, 3
-; CHECK-NEXT: srli a4, a1, 3
-; CHECK-NEXT: vslidedown.vx v6, v0, a4
+; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: sub a4, a2, a1
-; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vl8re64.v v8, (a3)
+; CHECK-NEXT: vslidedown.vx v6, v0, a3
; CHECK-NEXT: sltu a3, a2, a4
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: slli a4, a1, 3
+; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t
+; CHECK-NEXT: vl8re64.v v8, (a4)
; CHECK-NEXT: vmv1r.v v0, v26
; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
; CHECK-NEXT: csrr a3, vlenb
@@ -1644,11 +1642,9 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t
-; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: vmv1r.v v0, v26
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
@@ -1707,12 +1703,11 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1727,15 +1722,10 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: vfmin.vv v8, v16, v8
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -1744,27 +1734,21 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v24
+; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v24, v16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index b569efc7447da6b..f52200b4e7c34e5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -984,10 +984,10 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 48
+; CHECK-NEXT: li a3, 40
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
@@ -1001,43 +1001,33 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a3, a1, 3
-; CHECK-NEXT: sub a5, a4, a1
-; CHECK-NEXT: add a6, a2, a3
-; CHECK-NEXT: vl8re64.v v8, (a6)
-; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: li a7, 40
-; CHECK-NEXT: mul a6, a6, a7
-; CHECK-NEXT: add a6, sp, a6
-; CHECK-NEXT: addi a6, a6, 16
-; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT: sltu a6, a4, a5
-; CHECK-NEXT: addi a6, a6, -1
-; CHECK-NEXT: and a5, a6, a5
-; CHECK-NEXT: srli a6, a1, 3
-; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vl8re64.v v16, (a3)
; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT: vslidedown.vx v0, v0, a6
-; CHECK-NEXT: li a3, 63
+; CHECK-NEXT: slli a5, a3, 3
+; CHECK-NEXT: srli a1, a3, 3
+; CHECK-NEXT: sub a6, a4, a3
+; CHECK-NEXT: vslidedown.vx v0, v0, a1
+; CHECK-NEXT: add a1, a2, a5
+; CHECK-NEXT: vl8re64.v v8, (a1)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: sltu a1, a4, a6
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a6, a1, a6
+; CHECK-NEXT: li a1, 63
+; CHECK-NEXT: add a5, a0, a5
+; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: li a7, 40
-; CHECK-NEXT: mul a6, a6, a7
+; CHECK-NEXT: slli a6, a6, 4
; CHECK-NEXT: add a6, sp, a6
; CHECK-NEXT: addi a6, a6, 16
-; CHECK-NEXT: vl8r.v v8, (a6) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v16, (a5)
; CHECK-NEXT: csrr a5, vlenb
; CHECK-NEXT: slli a5, a5, 3
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a5, vlenb
; CHECK-NEXT: slli a5, a5, 4
; CHECK-NEXT: add a5, sp, a5
@@ -1048,21 +1038,35 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vsrl.vv v16, v16, v8, v0.t
+; CHECK-NEXT: vsrl.vv v16, v8, v16, v0.t
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 3
+; CHECK-NEXT: slli a5, a5, 4
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: li a6, 40
+; CHECK-NEXT: li a6, 24
; CHECK-NEXT: mul a5, a5, a6
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vnot.v v8, v8, v0.t
-; CHECK-NEXT: vand.vx v8, v8, a3, v0.t
-; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vsll.vv v8, v8, v16, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 4
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 24
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
@@ -1071,66 +1075,40 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a2)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 40
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsll.vi v16, v8, 1, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsll.vv v16, v16, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a4, a1, .LBB46_2
+; CHECK-NEXT: bltu a4, a3, .LBB46_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a4, a1
+; CHECK-NEXT: mv a4, a3
; CHECK-NEXT: .LBB46_2:
; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT: vand.vx v8, v16, a3, v0.t
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsrl.vv v8, v16, v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsrl.vv v16, v16, v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vnot.v v16, v8, v0.t
-; CHECK-NEXT: vand.vx v16, v16, a3, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add a0, sp, a0
@@ -1151,7 +1129,7 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 48
+; CHECK-NEXT: li a1, 40
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
@@ -1176,13 +1154,13 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1193,7 +1171,10 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: vslidedown.vx v0, v0, a1
; CHECK-NEXT: add a1, a2, a5
; CHECK-NEXT: vl8re64.v v8, (a1)
-; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: sltu a1, a4, a6
; CHECK-NEXT: addi a1, a1, -1
@@ -1207,7 +1188,8 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: addi a6, a6, 16
; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: slli a6, a6, 5
+; CHECK-NEXT: li a7, 24
+; CHECK-NEXT: mul a6, a6, a7
; CHECK-NEXT: add a6, sp, a6
; CHECK-NEXT: addi a6, a6, 16
; CHECK-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload
@@ -1218,17 +1200,34 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: vl8r.v v8, (a6) # Unknown-size Folded Reload
; CHECK-NEXT: vsll.vv v16, v16, v8, v0.t
; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: slli a6, a6, 3
+; CHECK-NEXT: li a7, 24
+; CHECK-NEXT: mul a6, a6, a7
; CHECK-NEXT: add a6, sp, a6
; CHECK-NEXT: addi a6, a6, 16
; CHECK-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill
; CHECK-NEXT: add a5, a0, a5
-; CHECK-NEXT: addi a6, sp, 16
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: slli a6, a6, 3
+; CHECK-NEXT: add a6, sp, a6
+; CHECK-NEXT: addi a6, a6, 16
; CHECK-NEXT: vl8r.v v8, (a6) # Unknown-size Folded Reload
; CHECK-NEXT: vnot.v v8, v8, v0.t
-; CHECK-NEXT: vl8re64.v v16, (a5)
-; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT: vl8re64.v v8, (a5)
+; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v16, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 24
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: li a6, 24
+; CHECK-NEXT: mul a5, a5, a6
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
@@ -1237,59 +1236,37 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a2)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vsrl.vi v16, v16, 1, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: bltu a4, a3, .LBB47_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a3
; CHECK-NEXT: .LBB47_2:
; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a0, a0, a2
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsll.vv v8, v8, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a0, a0, a2
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsll.vv v16, v16, v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vnot.v v8, v8, v0.t
; CHECK-NEXT: vand.vx v16, v8, a1, v0.t
@@ -1301,14 +1278,14 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t
; CHECK-NEXT: vsrl.vv v8, v8, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vor.vv v8, v16, v8, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 9ee2324f615dd8d..7415e26b52095ea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -1241,21 +1241,21 @@ define void @mgather_nxv16i64(<vscale x 8 x ptr> %ptrs0, <vscale x 8 x ptr> %ptr
; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; RV64-NEXT: vmv8r.v v16, v8
-; RV64-NEXT: vl8re64.v v24, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: vl8re64.v v8, (a1)
-; RV64-NEXT: srli a1, a0, 3
-; RV64-NEXT: vslidedown.vx v7, v0, a1
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t
-; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vl8re64.v v24, (a1)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a3, a1, 3
+; RV64-NEXT: vslidedown.vx v7, v0, a3
+; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu
; RV64-NEXT: vluxei64.v v8, (zero), v16, v0.t
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, a2, a0
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vs8r.v v24, (a2)
+; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: vs8r.v v24, (a1)
+; RV64-NEXT: vs8r.v v8, (a2)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
index 77a1f508d221842..6efb61f53614e16 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
@@ -1894,57 +1894,25 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 5
+; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 24
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: addi a2, sp, 16
; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 4
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vl8re64.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: vl8re64.v v8, (a1)
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: srli a0, a0, 3
-; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vx v24, v0, a0
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vl8re64.v v16, (a1)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vx v7, v0, a1
+; RV64-NEXT: vl8re64.v v24, (a0)
; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT: vsoxei64.v v16, (zero), v8, v0.t
-; RV64-NEXT: vmv1r.v v0, v24
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a1, 24
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
+; RV64-NEXT: vmv1r.v v0, v7
; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vsoxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
; RV64-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
index 53d1666c30e9609..3101f78f882515c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
@@ -4,13 +4,13 @@
define <vscale x 16 x i32> @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <vscale x 16 x i32> %w, <vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
; CHECK-LABEL: bar:
; CHECK: # %bb.0:
-; CHECK-NEXT: ld a0, 0(sp)
-; CHECK-NEXT: ld a1, 8(sp)
+; CHECK-NEXT: ld a0, 8(sp)
+; CHECK-NEXT: ld a1, 0(sp)
; CHECK-NEXT: vl8re32.v v24, (a0)
; CHECK-NEXT: vl8re32.v v0, (a1)
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT: vadd.vv v8, v8, v24
-; CHECK-NEXT: vadd.vv v16, v16, v0
+; CHECK-NEXT: vadd.vv v8, v8, v0
+; CHECK-NEXT: vadd.vv v16, v16, v24
; CHECK-NEXT: vadd.vv v8, v8, v16
; CHECK-NEXT: ret
%s0 = add <vscale x 16 x i32> %w, %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 30ef3dccd426b82..9924591257f6c1b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1466,13 +1466,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: slli a3, a1, 5
+; CHECK-NEXT: add a1, a3, a1
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
@@ -1500,79 +1497,78 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: and a7, a7, a1
; CHECK-NEXT: srli a1, a3, 1
; CHECK-NEXT: srli a3, a3, 2
-; CHECK-NEXT: csrr t0, vlenb
-; CHECK-NEXT: slli t0, t0, 3
-; CHECK-NEXT: add t0, sp, t0
-; CHECK-NEXT: addi t0, t0, 16
+; CHECK-NEXT: addi t0, sp, 16
; CHECK-NEXT: vs1r.v v24, (t0) # Unknown-size Folded Spill
; CHECK-NEXT: vslidedown.vx v25, v24, a1
; CHECK-NEXT: vsetvli t0, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v25, a3
-; CHECK-NEXT: vl8re16.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli t0, a0, 5
-; CHECK-NEXT: add a0, t0, a0
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli t0, a0, 3
-; CHECK-NEXT: add a0, t0, a0
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv4r.v v8, v0
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: add t0, sp, t0
+; CHECK-NEXT: addi t0, t0, 16
+; CHECK-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli zero, a7, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a7, vlenb
+; CHECK-NEXT: slli t0, a7, 4
+; CHECK-NEXT: add a7, t0, a7
+; CHECK-NEXT: add a7, sp, a7
+; CHECK-NEXT: addi a7, a7, 16
+; CHECK-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: csrr a7, vlenb
+; CHECK-NEXT: slli t0, a7, 3
+; CHECK-NEXT: add a7, t0, a7
+; CHECK-NEXT: add a7, sp, a7
+; CHECK-NEXT: addi a7, a7, 16
+; CHECK-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a7, vlenb
+; CHECK-NEXT: slli t0, a7, 4
+; CHECK-NEXT: add a7, t0, a7
+; CHECK-NEXT: add a7, sp, a7
+; CHECK-NEXT: addi a7, a7, 16
+; CHECK-NEXT: vl8r.v v8, (a7) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v4, v16, v8, v0.t
+; CHECK-NEXT: vl8re16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a7, a0, 4
; CHECK-NEXT: add a0, a7, a0
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v6, v16, v8, v0.t
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: bltu a6, a4, .LBB85_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a6, a4
; CHECK-NEXT: .LBB85_2:
; CHECK-NEXT: vmv1r.v v0, v25
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a7, a0, 3
-; CHECK-NEXT: add a0, a7, a0
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a6, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a6, a0, 4
+; CHECK-NEXT: slli a6, a0, 3
; CHECK-NEXT: add a0, a6, a0
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v5, v24, v8, v0.t
+; CHECK-NEXT: vmfeq.vv v7, v8, v16, v0.t
; CHECK-NEXT: add a0, a3, a3
; CHECK-NEXT: bltu a2, a5, .LBB85_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a2, a5
; CHECK-NEXT: .LBB85_4:
; CHECK-NEXT: sub a5, a2, a4
-; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: slli a6, a6, 3
-; CHECK-NEXT: add a6, sp, a6
-; CHECK-NEXT: addi a6, a6, 16
-; CHECK-NEXT: vl1r.v v7, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a6, sp, 16
+; CHECK-NEXT: vl1r.v v5, (a6) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v7, a3
+; CHECK-NEXT: vslidedown.vx v0, v5, a3
; CHECK-NEXT: sltu a6, a2, a5
; CHECK-NEXT: addi a6, a6, -1
; CHECK-NEXT: and a5, a6, a5
@@ -1584,60 +1580,54 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
; CHECK-NEXT: add a6, a6, a7
; CHECK-NEXT: add a6, sp, a6
; CHECK-NEXT: addi a6, a6, 16
-; CHECK-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v8, (a6) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a5, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
; CHECK-NEXT: csrr a5, vlenb
; CHECK-NEXT: slli a6, a5, 4
; CHECK-NEXT: add a5, a6, a5
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a6, a5, 5
-; CHECK-NEXT: add a5, a6, a5
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a6, a5, 4
-; CHECK-NEXT: add a5, a6, a5
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v4, v24, v8, v0.t
+; CHECK-NEXT: vmfeq.vv v6, v16, v8, v0.t
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v5, v6, a3
+; CHECK-NEXT: vslideup.vx v7, v4, a3
; CHECK-NEXT: bltu a2, a4, .LBB85_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a2, a4
; CHECK-NEXT: .LBB85_6:
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmv1r.v v0, v5
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a4, a2, 5
+; CHECK-NEXT: slli a4, a2, 4
; CHECK-NEXT: add a2, a4, a2
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v4, a3
+; CHECK-NEXT: vslideup.vx v8, v6, a3
; CHECK-NEXT: add a0, a1, a1
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v5, a1
+; CHECK-NEXT: vslideup.vx v8, v7, a1
; CHECK-NEXT: vmv.v.v v0, v8
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: slli a1, a0, 5
+; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -3728,38 +3718,38 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFH-NEXT: addi a1, a1, 16
; ZVFH-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; ZVFH-NEXT: csrr a3, vlenb
-; ZVFH-NEXT: srli a1, a3, 1
+; ZVFH-NEXT: vl8re16.v v8, (a0)
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; ZVFH-NEXT: slli a4, a3, 3
-; ZVFH-NEXT: slli a3, a3, 2
+; ZVFH-NEXT: slli a1, a3, 2
; ZVFH-NEXT: add a4, a0, a4
-; ZVFH-NEXT: sub a5, a2, a3
+; ZVFH-NEXT: sub a0, a2, a1
+; ZVFH-NEXT: sltu a5, a2, a0
+; ZVFH-NEXT: addi a5, a5, -1
+; ZVFH-NEXT: and a5, a5, a0
+; ZVFH-NEXT: srli a0, a3, 1
+; ZVFH-NEXT: vslidedown.vx v0, v0, a0
; ZVFH-NEXT: vl8re16.v v8, (a4)
-; ZVFH-NEXT: sltu a4, a2, a5
-; ZVFH-NEXT: addi a4, a4, -1
-; ZVFH-NEXT: vl8re16.v v0, (a0)
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vslidedown.vx v0, v24, a1
-; ZVFH-NEXT: and a4, a4, a5
-; ZVFH-NEXT: vsetvli zero, a4, e16, m8, ta, ma
+; ZVFH-NEXT: vsetvli zero, a5, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v7, v16, v8, v0.t
-; ZVFH-NEXT: bltu a2, a3, .LBB171_2
+; ZVFH-NEXT: bltu a2, a1, .LBB171_2
; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: mv a2, a3
+; ZVFH-NEXT: mv a2, a1
; ZVFH-NEXT: .LBB171_2:
; ZVFH-NEXT: vmv1r.v v0, v24
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: add a1, sp, a1
+; ZVFH-NEXT: addi a1, a1, 16
+; ZVFH-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFH-NEXT: addi a1, sp, 16
+; ZVFH-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFH-NEXT: vsetvli zero, a2, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v16, v8, v24, v0.t
-; ZVFH-NEXT: add a0, a1, a1
-; ZVFH-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; ZVFH-NEXT: vslideup.vx v16, v7, a1
+; ZVFH-NEXT: add a1, a0, a0
+; ZVFH-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; ZVFH-NEXT: vslideup.vx v16, v7, a0
; ZVFH-NEXT: vmv.v.v v0, v16
; ZVFH-NEXT: csrr a0, vlenb
; ZVFH-NEXT: slli a0, a0, 4
@@ -3774,13 +3764,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a3, a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a1, a1, a3
+; ZVFHMIN-NEXT: slli a3, a1, 5
+; ZVFHMIN-NEXT: add a1, a3, a1
; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v24, v0
; ZVFHMIN-NEXT: csrr a1, vlenb
@@ -3808,79 +3795,78 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: and a7, a7, a1
; ZVFHMIN-NEXT: srli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: csrr t0, vlenb
-; ZVFHMIN-NEXT: slli t0, t0, 3
-; ZVFHMIN-NEXT: add t0, sp, t0
-; ZVFHMIN-NEXT: addi t0, t0, 16
+; ZVFHMIN-NEXT: addi t0, sp, 16
; ZVFHMIN-NEXT: vs1r.v v24, (t0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vslidedown.vx v25, v24, a1
; ZVFHMIN-NEXT: vsetvli t0, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v24, v25, a3
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli t0, a0, 5
-; ZVFHMIN-NEXT: add a0, t0, a0
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli t0, a0, 3
-; ZVFHMIN-NEXT: add a0, t0, a0
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv4r.v v8, v0
+; ZVFHMIN-NEXT: csrr t0, vlenb
+; ZVFHMIN-NEXT: add t0, sp, t0
+; ZVFHMIN-NEXT: addi t0, t0, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (t0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, a7, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: csrr a7, vlenb
+; ZVFHMIN-NEXT: slli t0, a7, 4
+; ZVFHMIN-NEXT: add a7, t0, a7
+; ZVFHMIN-NEXT: add a7, sp, a7
+; ZVFHMIN-NEXT: addi a7, a7, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv8r.v v8, v16
+; ZVFHMIN-NEXT: csrr a7, vlenb
+; ZVFHMIN-NEXT: slli t0, a7, 3
+; ZVFHMIN-NEXT: add a7, t0, a7
+; ZVFHMIN-NEXT: add a7, sp, a7
+; ZVFHMIN-NEXT: addi a7, a7, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vmv1r.v v0, v24
+; ZVFHMIN-NEXT: csrr a7, vlenb
+; ZVFHMIN-NEXT: slli t0, a7, 4
+; ZVFHMIN-NEXT: add a7, t0, a7
+; ZVFHMIN-NEXT: add a7, sp, a7
+; ZVFHMIN-NEXT: addi a7, a7, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a7) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v4, v16, v8, v0.t
+; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a7, a0, 4
; ZVFHMIN-NEXT: add a0, a7, a0
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vmv1r.v v0, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v6, v16, v8, v0.t
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: bltu a6, a4, .LBB171_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a6, a4
; ZVFHMIN-NEXT: .LBB171_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v25
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a7, a0, 3
-; ZVFHMIN-NEXT: add a0, a7, a0
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a6, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a6, a0, 4
+; ZVFHMIN-NEXT: slli a6, a0, 3
; ZVFHMIN-NEXT: add a0, a6, a0
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v5, v24, v8, v0.t
+; ZVFHMIN-NEXT: vmfeq.vv v7, v8, v16, v0.t
; ZVFHMIN-NEXT: add a0, a3, a3
; ZVFHMIN-NEXT: bltu a2, a5, .LBB171_4
; ZVFHMIN-NEXT: # %bb.3:
; ZVFHMIN-NEXT: mv a2, a5
; ZVFHMIN-NEXT: .LBB171_4:
; ZVFHMIN-NEXT: sub a5, a2, a4
-; ZVFHMIN-NEXT: csrr a6, vlenb
-; ZVFHMIN-NEXT: slli a6, a6, 3
-; ZVFHMIN-NEXT: add a6, sp, a6
-; ZVFHMIN-NEXT: addi a6, a6, 16
-; ZVFHMIN-NEXT: vl1r.v v7, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: addi a6, sp, 16
+; ZVFHMIN-NEXT: vl1r.v v5, (a6) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT: vslidedown.vx v0, v5, a3
; ZVFHMIN-NEXT: sltu a6, a2, a5
; ZVFHMIN-NEXT: addi a6, a6, -1
; ZVFHMIN-NEXT: and a5, a6, a5
@@ -3892,60 +3878,54 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: add a6, a6, a7
; ZVFHMIN-NEXT: add a6, sp, a6
; ZVFHMIN-NEXT: addi a6, a6, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a6) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a5, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
; ZVFHMIN-NEXT: csrr a5, vlenb
; ZVFHMIN-NEXT: slli a6, a5, 4
; ZVFHMIN-NEXT: add a5, a6, a5
; ZVFHMIN-NEXT: add a5, sp, a5
; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a6, a5, 5
-; ZVFHMIN-NEXT: add a5, a6, a5
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a6, a5, 4
-; ZVFHMIN-NEXT: add a5, a6, a5
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v4, v24, v8, v0.t
+; ZVFHMIN-NEXT: vmfeq.vv v6, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v5, v6, a3
+; ZVFHMIN-NEXT: vslideup.vx v7, v4, a3
; ZVFHMIN-NEXT: bltu a2, a4, .LBB171_6
; ZVFHMIN-NEXT: # %bb.5:
; ZVFHMIN-NEXT: mv a2, a4
; ZVFHMIN-NEXT: .LBB171_6:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: vmv1r.v v0, v5
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: add a5, a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a4, a2, 5
+; ZVFHMIN-NEXT: slli a4, a2, 4
; ZVFHMIN-NEXT: add a2, a4, a2
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v8, v4, a3
+; ZVFHMIN-NEXT: vslideup.vx v8, v6, a3
; ZVFHMIN-NEXT: add a0, a1, a1
; ZVFHMIN-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v8, v5, a1
+; ZVFHMIN-NEXT: vslideup.vx v8, v7, a1
; ZVFHMIN-NEXT: vmv.v.v v0, v8
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a1, a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, a1
+; ZVFHMIN-NEXT: slli a1, a0, 5
+; ZVFHMIN-NEXT: add a0, a1, a0
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
; ZVFHMIN-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 9c733b17dc6e9a8..c802ced1e24436b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1084,31 +1084,31 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vl8r.v v8, (a0)
+; CHECK-NEXT: addi a4, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; CHECK-NEXT: vlm.v v0, (a2)
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a2, a0, a1
-; CHECK-NEXT: sub a4, a3, a1
-; CHECK-NEXT: vl8r.v v8, (a2)
-; CHECK-NEXT: sltu a2, a3, a4
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a4
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: sub a2, a3, a1
+; CHECK-NEXT: sltu a4, a3, a2
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: vl8r.v v8, (a0)
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vmseq.vv v6, v16, v8, v0.t
+; CHECK-NEXT: vmseq.vv v7, v16, v8, v0.t
; CHECK-NEXT: bltu a3, a1, .LBB96_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a3, a1
; CHECK-NEXT: .LBB96_2:
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
@@ -1119,7 +1119,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t
; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: vmv1r.v v8, v6
+; CHECK-NEXT: vmv1r.v v8, v7
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
@@ -2237,38 +2237,38 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: srli a1, a3, 2
+; CHECK-NEXT: vl8re32.v v8, (a0)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: slli a4, a3, 3
-; CHECK-NEXT: slli a3, a3, 1
+; CHECK-NEXT: slli a1, a3, 1
; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sub a5, a2, a3
+; CHECK-NEXT: sub a0, a2, a1
+; CHECK-NEXT: sltu a5, a2, a0
+; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: and a5, a5, a0
+; CHECK-NEXT: srli a0, a3, 2
+; CHECK-NEXT: vslidedown.vx v0, v0, a0
; CHECK-NEXT: vl8re32.v v8, (a4)
-; CHECK-NEXT: sltu a4, a2, a5
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: vl8re32.v v0, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vslidedown.vx v0, v24, a1
-; CHECK-NEXT: and a4, a4, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vv v7, v16, v8, v0.t
-; CHECK-NEXT: bltu a2, a3, .LBB189_2
+; CHECK-NEXT: bltu a2, a1, .LBB189_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB189_2:
; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t
-; CHECK-NEXT: add a0, a1, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v16, v7, a1
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v16, v7, a0
; CHECK-NEXT: vmv1r.v v0, v16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 14f306da21dba77..b3b468596e11015 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -140,27 +140,27 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: li a1, 85
; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v16, a1
+; CHECK-NEXT: li a1, 170
+; CHECK-NEXT: vmv.v.x v17, a1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: vl8re64.v v24, (a0)
+; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: li a1, 170
-; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: vmv.v.x v17, a1
+; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vcompress.vm v8, v24, v16
+; CHECK-NEXT: vcompress.vm v8, v0, v16
; CHECK-NEXT: vmv1r.v v12, v16
; CHECK-NEXT: vmv1r.v v13, v17
-; CHECK-NEXT: vcompress.vm v16, v24, v13
-; CHECK-NEXT: vcompress.vm v24, v0, v12
+; CHECK-NEXT: vcompress.vm v16, v0, v13
+; CHECK-NEXT: vcompress.vm v0, v24, v12
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vcompress.vm v24, v0, v13
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vcompress.vm v0, v24, v13
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v12, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index 5ee5d40d8313de4..4316d5cb403dd2f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -3576,24 +3576,17 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: mv a3, a1
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: vmv8r.v v24, v16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -3601,96 +3594,46 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: slli a5, a1, 3
; CHECK-NEXT: sub a6, a4, a1
-; CHECK-NEXT: add a7, a2, a5
+; CHECK-NEXT: vslidedown.vx v0, v0, a3
+; CHECK-NEXT: add a3, a2, a5
+; CHECK-NEXT: vl8re64.v v8, (a3)
+; CHECK-NEXT: sltu a3, a4, a6
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a6
; CHECK-NEXT: add a5, a0, a5
-; CHECK-NEXT: vl8re64.v v8, (a7)
-; CHECK-NEXT: csrr a7, vlenb
-; CHECK-NEXT: slli a7, a7, 3
-; CHECK-NEXT: add a7, sp, a7
-; CHECK-NEXT: addi a7, a7, 16
-; CHECK-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
-; CHECK-NEXT: sltu a7, a4, a6
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: vl8re64.v v8, (a5)
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 3
-; CHECK-NEXT: mv t0, a5
-; CHECK-NEXT: slli a5, a5, 2
-; CHECK-NEXT: add a5, a5, t0
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v16, (a5)
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a2)
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: and a0, a7, a6
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: slli a2, a2, 2
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a2, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: bltu a4, a1, .LBB128_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a1
; CHECK-NEXT: .LBB128_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: mv a1, a0
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add a0, a0, a1
@@ -3706,80 +3649,28 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 zeroext %evl) {
; CHECK-LABEL: vfma_vv_nxv16f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: add a5, a2, a3
-; CHECK-NEXT: vl8re64.v v8, (a5)
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 3
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v0, (a5)
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: sltu a3, a4, a5
-; CHECK-NEXT: vl8re64.v v8, (a2)
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a5
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v24, v16, v8
+; CHECK-NEXT: vfmadd.vv v24, v16, v0
+; CHECK-NEXT: vl8re64.v v0, (a2)
+; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: bltu a4, a1, .LBB129_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a1
; CHECK-NEXT: .LBB129_2:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v0, v16, v8
-; CHECK-NEXT: vmv.v.v v8, v0
+; CHECK-NEXT: vfmadd.vv v16, v8, v0
+; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: vmv8r.v v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%v = call <vscale x 16 x double> @llvm.vp.fma.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 901f3cd63fa9e5f..432994de33321a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1108,20 +1108,15 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 48
+; CHECK-NEXT: li a3, 24
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: vmv8r.v v24, v16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1129,86 +1124,46 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: slli a5, a1, 3
; CHECK-NEXT: sub a6, a4, a1
-; CHECK-NEXT: add a7, a2, a5
+; CHECK-NEXT: vslidedown.vx v0, v0, a3
+; CHECK-NEXT: add a3, a2, a5
+; CHECK-NEXT: vl8re64.v v8, (a3)
+; CHECK-NEXT: sltu a3, a4, a6
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a6
; CHECK-NEXT: add a5, a0, a5
-; CHECK-NEXT: vl8re64.v v8, (a7)
-; CHECK-NEXT: csrr a7, vlenb
-; CHECK-NEXT: slli a7, a7, 3
-; CHECK-NEXT: add a7, sp, a7
-; CHECK-NEXT: addi a7, a7, 16
-; CHECK-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill
-; CHECK-NEXT: sltu a7, a4, a6
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: vl8re64.v v8, (a5)
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: li t0, 40
-; CHECK-NEXT: mul a5, a5, t0
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v16, (a5)
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a2)
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: and a0, a7, a6
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 40
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 40
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: bltu a4, a1, .LBB92_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a1
; CHECK-NEXT: .LBB92_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 48
+; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
@@ -1222,76 +1177,28 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 zeroext %evl) {
; CHECK-LABEL: vfma_vv_nxv16f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: add a5, a2, a3
-; CHECK-NEXT: vl8re64.v v8, (a5)
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 3
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v0, (a5)
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: sltu a3, a4, a5
-; CHECK-NEXT: vl8re64.v v8, (a2)
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a5
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v24, v16, v8
+; CHECK-NEXT: vfmadd.vv v24, v16, v0
+; CHECK-NEXT: vl8re64.v v0, (a2)
+; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: bltu a4, a1, .LBB93_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a1
; CHECK-NEXT: .LBB93_2:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v0, v16, v8
-; CHECK-NEXT: vmv.v.v v8, v0
+; CHECK-NEXT: vfmadd.vv v16, v8, v0
+; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: vmv8r.v v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%v = call <vscale x 16 x double> @llvm.vp.fmuladd.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x double> %b, <vscale x 16 x double> %c, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 63156e1399293f3..30fa2a110b2957e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -149,12 +149,12 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a3, a1, 3
@@ -165,18 +165,18 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: add a6, a0, a6
; CHECK-NEXT: sub a5, a2, a4
; CHECK-NEXT: vl8re64.v v24, (a6)
+; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v16, a3
; CHECK-NEXT: sltu a6, a2, a5
; CHECK-NEXT: addi a6, a6, -1
; CHECK-NEXT: and a5, a6, a5
; CHECK-NEXT: sub a6, a5, a1
; CHECK-NEXT: sltu a7, a5, a6
; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v16, a3
-; CHECK-NEXT: and a0, a7, a6
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: and a6, a7, a6
+; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma
; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t
+; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: bltu a5, a1, .LBB8_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a5, a1
@@ -185,7 +185,7 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v7, a3
; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT: vfncvt.f.f.w v16, v8, v0.t
+; CHECK-NEXT: vfncvt.f.f.w v16, v24, v0.t
; CHECK-NEXT: bltu a2, a4, .LBB8_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a2, a4
@@ -195,23 +195,22 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a0, a3, a0
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vfncvt.f.f.w v28, v8, v0.t
+; CHECK-NEXT: vfncvt.f.f.w v12, v24, v0.t
; CHECK-NEXT: bltu a2, a1, .LBB8_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB8_6:
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vfncvt.f.f.w v24, v8, v0.t
-; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vfncvt.f.f.w v8, v24, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index 4cd77185e693045..cbafda801b2d3b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -566,15 +566,15 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vl8r.v v8, (a0)
; CHECK-NEXT: vlm.v v0, (a2)
; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a2, a0, a1
-; CHECK-NEXT: sub a4, a3, a1
-; CHECK-NEXT: vl8r.v v16, (a2)
-; CHECK-NEXT: sltu a2, a3, a4
-; CHECK-NEXT: vl8r.v v8, (a0)
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a4
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: sub a2, a3, a1
+; CHECK-NEXT: sltu a4, a3, a2
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: vl8r.v v16, (a0)
; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
; CHECK-NEXT: bltu a3, a1, .LBB35_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 647e3965b7ec27e..b8c82d21ad916a1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2274,13 +2274,13 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
; RV64-NEXT: slli a3, a1, 3
; RV64-NEXT: add a3, a0, a3
; RV64-NEXT: vl8re64.v v16, (a3)
+; RV64-NEXT: mv a3, a2
; RV64-NEXT: vl8re64.v v24, (a0)
-; RV64-NEXT: mv a0, a2
; RV64-NEXT: bltu a2, a1, .LBB108_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a0, a1
+; RV64-NEXT: mv a3, a1
; RV64-NEXT: .LBB108_2:
-; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
; RV64-NEXT: sub a0, a2, a1
; RV64-NEXT: srli a1, a1, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index d4ebe27420d7b34..3642d0fadf5f702 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -365,20 +365,20 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: vl8re32.v v8, (a0)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: slli a4, a3, 3
; CHECK-NEXT: slli a1, a3, 1
; CHECK-NEXT: srli a3, a3, 2
-; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sub a5, a2, a1
-; CHECK-NEXT: vl8re32.v v8, (a4)
-; CHECK-NEXT: sltu a4, a2, a5
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: vl8re32.v v0, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vslidedown.vx v0, v24, a3
-; CHECK-NEXT: and a4, a4, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; CHECK-NEXT: add a0, a0, a4
+; CHECK-NEXT: sub a4, a2, a1
+; CHECK-NEXT: vslidedown.vx v0, v0, a3
+; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: vl8re32.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
; CHECK-NEXT: bltu a2, a1, .LBB27_2
; CHECK-NEXT: # %bb.1:
@@ -424,19 +424,19 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vl8re32.v v8, (a0)
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: slli a2, a1, 1
; CHECK-NEXT: srli a4, a1, 2
-; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: sub a5, a1, a2
-; CHECK-NEXT: vl8re32.v v8, (a3)
-; CHECK-NEXT: sltu a3, a1, a5
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: vl8re32.v v0, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vslidedown.vx v0, v24, a4
-; CHECK-NEXT: and a3, a3, a5
+; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: sub a3, a1, a2
+; CHECK-NEXT: vslidedown.vx v0, v0, a4
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: vl8re32.v v8, (a0)
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
; CHECK-NEXT: bltu a1, a2, .LBB28_2
@@ -713,19 +713,19 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: sub a4, a2, a1
-; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: sltu a5, a2, a4
-; CHECK-NEXT: vl8re64.v v8, (a3)
-; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: srli a3, a1, 3
-; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vslidedown.vx v0, v24, a3
-; CHECK-NEXT: and a4, a5, a4
-; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a3
+; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
; CHECK-NEXT: bltu a2, a1, .LBB48_2
; CHECK-NEXT: # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index fd5bf4ebcede825..343288ad5a91ca4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -290,12 +290,12 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a3, a1, 3
@@ -306,18 +306,18 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: add a6, a0, a6
; CHECK-NEXT: sub a5, a2, a4
; CHECK-NEXT: vl8re64.v v24, (a6)
+; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v16, a3
; CHECK-NEXT: sltu a6, a2, a5
; CHECK-NEXT: addi a6, a6, -1
; CHECK-NEXT: and a5, a6, a5
; CHECK-NEXT: sub a6, a5, a1
; CHECK-NEXT: sltu a7, a5, a6
; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v16, a3
-; CHECK-NEXT: and a0, a7, a6
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: and a6, a7, a6
+; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v20, v24, 0, v0.t
+; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: bltu a5, a1, .LBB17_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a5, a1
@@ -326,7 +326,7 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v7, a3
; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT: vnsrl.wi v16, v24, 0, v0.t
; CHECK-NEXT: bltu a2, a4, .LBB17_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a2, a4
@@ -336,23 +336,22 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a0, a3, a0
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v28, v8, 0, v0.t
+; CHECK-NEXT: vnsrl.wi v12, v24, 0, v0.t
; CHECK-NEXT: bltu a2, a1, .LBB17_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: .LBB17_6:
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v24, v8, 0, v0.t
-; CHECK-NEXT: vmv8r.v v8, v24
+; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
>From e7f9eee0302d5d867e7b1f62bfd3d647b0686390 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 3 Feb 2025 00:53:59 -0800
Subject: [PATCH 02/11] fixup! Update comment
---
.../include/llvm/CodeGen/TargetRegisterInfo.h | 29 +++++++++++--------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 26107eb8e960b11..270b9cd8de1df58 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1190,6 +1190,23 @@ class TargetRegisterInfo : public MCRegisterInfo {
return false;
}
+ /// Based on the target and current register pressure information from the
+ /// Scheduler, determine whether to release the node in the pending queue
+ virtual bool
+ needReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const {
+ return false;
+ }
+
+ /// For each SUnit, determine whether to release it
+ /// from the pending queue based on the register pressure changes
+ /// associated with that SUnit.
+ virtual bool needReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const {
+ return false;
+ }
+
//===--------------------------------------------------------------------===//
/// Debug information queries.
@@ -1233,18 +1250,6 @@ class TargetRegisterInfo : public MCRegisterInfo {
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const {
return {};
}
-
- virtual bool
- needReleasePendingQueue(MachineFunction &MF,
- ArrayRef<unsigned> MaxSetPressure) const {
- return false;
- }
-
- virtual bool needReleaseSUFromPendingQueue(MachineFunction &MF,
- ArrayRef<unsigned> PSetID,
- ArrayRef<int> UnitInc) const {
- return false;
- }
};
//===----------------------------------------------------------------------===//
>From 7d055c1c8aded2cec0ac12c1851e8c04dbebe894 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Mon, 3 Feb 2025 01:07:13 -0800
Subject: [PATCH 03/11] fixup! add missing newline at end of file
---
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index fca11f0d1365426..91605c5acda0cb2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -986,4 +986,4 @@ bool RISCVRegisterInfo::needReleaseSUFromPendingQueue(
return true;
}
return false;
-}
\ No newline at end of file
+}
>From c8177f912bf24000e354e4eacc0ec1cf8e1521f8 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Thu, 6 Feb 2025 00:08:21 -0800
Subject: [PATCH 04/11] fixup! consider ReadyListLimit in
bumpCycleUntilReleaseSUFromPending
---
llvm/include/llvm/CodeGen/MachineScheduler.h | 5 +++--
llvm/lib/CodeGen/MachineScheduler.cpp | 2 +-
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 47809606ff40754..64d0314830156fa 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1069,8 +1069,9 @@ class SchedBoundary {
void dumpReservedCycles() const;
void dumpScheduledState() const;
- void bumpCycleUntilReleaseSUFromPending(SUnit *SU) {
- while (!Pending.empty() && llvm::find(Pending, SU) != Pending.end()) {
+ void bumpCycleUntilReleaseSUFromPending(SUnit *SU, unsigned ReadyListLimit) {
+ while (Available.size() < ReadyListLimit && !Pending.empty() &&
+ llvm::find(Pending, SU) != Pending.end()) {
bumpCycle(CurrCycle + 1);
releasePending();
}
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 586c8857bb199fc..9da12317ffb4250 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -3680,7 +3680,7 @@ void GenericScheduler::bumpCycleUntilReleaseSUFromPending(bool IsTop) {
UnitIncs.push_back(PChange.getUnitInc());
}
if (TRI->needReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
- SchedB.bumpCycleUntilReleaseSUFromPending(SU);
+ SchedB.bumpCycleUntilReleaseSUFromPending(SU, ReadyListLimit);
}
};
if (IsTop)
>From 8c496e4de845bf0c8920b48da110ef242acddb78 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Thu, 6 Feb 2025 00:30:10 -0800
Subject: [PATCH 05/11] fixup! need -> should
---
llvm/include/llvm/CodeGen/TargetRegisterInfo.h | 10 +++++-----
llvm/lib/CodeGen/MachineScheduler.cpp | 6 +++---
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 4 ++--
llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 10 +++++-----
4 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 270b9cd8de1df58..24174eb1c42a629 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1193,17 +1193,17 @@ class TargetRegisterInfo : public MCRegisterInfo {
/// Based on the target and current register pressure information from the
/// Scheduler, determine whether to release the node in the pending queue
virtual bool
- needReleasePendingQueue(MachineFunction &MF,
- ArrayRef<unsigned> MaxSetPressure) const {
+ shouldReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const {
return false;
}
/// For each SUnit, determine whether to release it
/// from the pending queue based on the register pressure changes
/// associated with that SUnit.
- virtual bool needReleaseSUFromPendingQueue(MachineFunction &MF,
- ArrayRef<unsigned> PSetID,
- ArrayRef<int> UnitInc) const {
+ virtual bool shouldReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const {
return false;
}
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 9da12317ffb4250..c777afb30023f0c 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -3679,7 +3679,7 @@ void GenericScheduler::bumpCycleUntilReleaseSUFromPending(bool IsTop) {
PSetIDs.push_back(PChange.getPSet());
UnitIncs.push_back(PChange.getUnitInc());
}
- if (TRI->needReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
+ if (TRI->shouldReleaseSUFromPendingQueue(DAG->MF, PSetIDs, UnitIncs))
SchedB.bumpCycleUntilReleaseSUFromPending(SU, ReadyListLimit);
}
};
@@ -3778,11 +3778,11 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
}
if (EnableReleasePendingQ && !RegionPolicy.OnlyBottomUp &&
- TRI->needReleasePendingQueue(
+ TRI->shouldReleasePendingQueue(
DAG->MF, DAG->getTopRPTracker().getPressure().MaxSetPressure))
bumpCycleUntilReleaseSUFromPending(/*IsTop=*/true);
if (EnableReleasePendingQ && !RegionPolicy.OnlyTopDown &&
- TRI->needReleasePendingQueue(
+ TRI->shouldReleasePendingQueue(
DAG->MF, DAG->getBotRPTracker().getPressure().MaxSetPressure))
bumpCycleUntilReleaseSUFromPending(/*IsTop=*/false);
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 91605c5acda0cb2..da9b725171c19c3 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -955,7 +955,7 @@ bool RISCVRegisterInfo::getRegAllocationHints(
return BaseImplRetVal;
}
-bool RISCVRegisterInfo::needReleasePendingQueue(
+bool RISCVRegisterInfo::shouldReleasePendingQueue(
MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
for (unsigned Idx = 0; Idx < MaxSetPressure.size(); Idx++) {
// Consider only the RVV Register, as RVV spilling/reloading has higher
@@ -972,7 +972,7 @@ bool RISCVRegisterInfo::needReleasePendingQueue(
return false;
}
-bool RISCVRegisterInfo::needReleaseSUFromPendingQueue(
+bool RISCVRegisterInfo::shouldReleaseSUFromPendingQueue(
MachineFunction &MF, ArrayRef<unsigned> PSetID,
ArrayRef<int> UnitInc) const {
const int UnitIncRVVRegPressureThreshold = -3;
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index faf81b2d8b73d65..026d00f253e9156 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -145,12 +145,12 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
return RISCVRI::isVRegClass(RC->TSFlags);
}
bool
- needReleasePendingQueue(MachineFunction &MF,
- ArrayRef<unsigned> MaxSetPressure) const override;
+ shouldReleasePendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> MaxSetPressure) const override;
- bool needReleaseSUFromPendingQueue(MachineFunction &MF,
- ArrayRef<unsigned> PSetID,
- ArrayRef<int> UnitInc) const override;
+ bool shouldReleaseSUFromPendingQueue(MachineFunction &MF,
+ ArrayRef<unsigned> PSetID,
+ ArrayRef<int> UnitInc) const override;
};
} // namespace llvm
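For reference, below is a minimal sketch of how another target could opt into the same mechanism by overriding the two TargetRegisterInfo hooks. The target name "MyTargetRegisterInfo", the 90% pressure heuristic, and the -2 unit-increase cutoff are illustrative assumptions only and are not part of this PR; just the hook signatures and the getRegPressureSetLimit call mirror the patch.

// Illustrative only: a hypothetical target's override of the new hooks.
bool MyTargetRegisterInfo::shouldReleasePendingQueue(
    MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
  for (unsigned Idx = 0; Idx < MaxSetPressure.size(); ++Idx) {
    // Ask the scheduler to drain the pending queue once any pressure set is
    // within 90% of its limit for this function.
    if (MaxSetPressure[Idx] * 10 >= getRegPressureSetLimit(MF, Idx) * 9)
      return true;
  }
  return false;
}

bool MyTargetRegisterInfo::shouldReleaseSUFromPendingQueue(
    MachineFunction &MF, ArrayRef<unsigned> PSetID,
    ArrayRef<int> UnitInc) const {
  // PSetID[I] and UnitInc[I] describe one pressure-set change of the
  // candidate SUnit; release it only if it relieves pressure enough.
  for (unsigned I = 0; I < PSetID.size(); ++I)
    if (UnitInc[I] < -2)
      return true;
  return false;
}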
>From 9d90fd8c8057b7dbcdf8a47e8899ede574878c0b Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Thu, 6 Feb 2025 00:38:38 -0800
Subject: [PATCH 06/11] fixup! update format
---
llvm/include/llvm/CodeGen/TargetRegisterInfo.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 24174eb1c42a629..62e622800d70ff6 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1198,9 +1198,8 @@ class TargetRegisterInfo : public MCRegisterInfo {
return false;
}
- /// For each SUnit, determine whether to release it
- /// from the pending queue based on the register pressure changes
- /// associated with that SUnit.
+ /// For each SUnit, determine whether to release it from the pending queue
+ /// based on the register pressure changes associated with that SUnit.
virtual bool shouldReleaseSUFromPendingQueue(MachineFunction &MF,
ArrayRef<unsigned> PSetID,
ArrayRef<int> UnitInc) const {
>From 17e19f0444c8e400b3beba697335ae54233bce89 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Thu, 6 Feb 2025 00:46:46 -0800
Subject: [PATCH 07/11] fixup! drop pending.empty()
---
llvm/include/llvm/CodeGen/MachineScheduler.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 64d0314830156fa..f89fdf53801bfd8 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1070,7 +1070,7 @@ class SchedBoundary {
void dumpScheduledState() const;
void bumpCycleUntilReleaseSUFromPending(SUnit *SU, unsigned ReadyListLimit) {
- while (Available.size() < ReadyListLimit && !Pending.empty() &&
+ while (Available.size() < ReadyListLimit &&
llvm::find(Pending, SU) != Pending.end()) {
bumpCycle(CurrCycle + 1);
releasePending();
>From 900ac8d49dcca4d58fe173885ef004d44100d6d8 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Thu, 6 Feb 2025 01:47:23 -0800
Subject: [PATCH 08/11] fixup! early return when there are no VInstructions
---
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index da9b725171c19c3..cfdc50ed23410da 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -957,6 +957,10 @@ bool RISCVRegisterInfo::getRegAllocationHints(
bool RISCVRegisterInfo::shouldReleasePendingQueue(
MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
+
+ if (!MF.getSubtarget<RISCVSubtarget>().hasVInstructions())
+ return false;
+
for (unsigned Idx = 0; Idx < MaxSetPressure.size(); Idx++) {
// Consider only the RVV Register, as RVV spilling/reloading has higher
// potential costs than hazards.
>From 6db2bfd924ba7f2f2b4ba225edf3c9af19ce7416 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Sun, 9 Feb 2025 18:38:06 -0800
Subject: [PATCH 09/11] fixup! add early exit to
shouldReleaseSUFromPendingQueue
---
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index cfdc50ed23410da..a422c6ffbbce167 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -979,6 +979,10 @@ bool RISCVRegisterInfo::shouldReleasePendingQueue(
bool RISCVRegisterInfo::shouldReleaseSUFromPendingQueue(
MachineFunction &MF, ArrayRef<unsigned> PSetID,
ArrayRef<int> UnitInc) const {
+
+ if (!MF.getSubtarget<RISCVSubtarget>().hasVInstructions())
+ return false;
+
const int UnitIncRVVRegPressureThreshold = -3;
for (unsigned Idx = 0; Idx < PSetID.size(); Idx++) {
// Consider only the RVV Register, as RVV spilling/reloading has higher
>From 01722cbf83721a7d10eb62b37d52dc40bb4d2f21 Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Sun, 9 Feb 2025 19:17:06 -0800
Subject: [PATCH 10/11] fixup! make thresholds cl::opt options
---
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index a422c6ffbbce167..b97071f1ba5fcb3 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -37,6 +37,18 @@ static cl::opt<bool>
cl::desc("Disable two address hints for register "
"allocation"));
+static cl::opt<unsigned> RVVReleasePendingPressureThreshold(
+ "riscv-release-pending-queue-rvv-pressure-threshold", cl::Hidden,
+ cl::desc("Release pending queue when vector register pressure exceeds "
+ "threshold"),
+ cl::init(7));
+
+static cl::opt<int> UnitIncRVVRegPressureThreshold(
+ "riscv-release-pending-queue-unit-inc-rvv-pressure-threshold", cl::Hidden,
+ cl::desc("Release pending queue when vector register unit pressure "
+ "increase exceeds threshold"),
+ cl::init(-3));
+
static_assert(RISCV::X1 == RISCV::X0 + 1, "Register list not consecutive");
static_assert(RISCV::X31 == RISCV::X0 + 31, "Register list not consecutive");
static_assert(RISCV::F1_H == RISCV::F0_H + 1, "Register list not consecutive");
@@ -967,8 +979,7 @@ bool RISCVRegisterInfo::shouldReleasePendingQueue(
if (!StringRef(getRegPressureSetName(Idx)).starts_with("VM") &&
!StringRef(getRegPressureSetName(Idx)).starts_with("VRM8NoV0"))
continue;
- const unsigned RVVRegPressureThreshold = 7;
- if (MaxSetPressure[Idx] + RVVRegPressureThreshold >
+ if (MaxSetPressure[Idx] + RVVReleasePendingPressureThreshold >
getRegPressureSetLimit(MF, Idx)) {
return true;
}
@@ -983,7 +994,6 @@ bool RISCVRegisterInfo::shouldReleaseSUFromPendingQueue(
if (!MF.getSubtarget<RISCVSubtarget>().hasVInstructions())
return false;
- const int UnitIncRVVRegPressureThreshold = -3;
for (unsigned Idx = 0; Idx < PSetID.size(); Idx++) {
// Consider only the RVV Register, as RVV spilling/reloading has higher
// potential costs than hazards.
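With the thresholds exposed as cl::opt values, they can be tuned without rebuilding. A hypothetical llc invocation follows; the flag names are taken from the definitions above, while the triple, the chosen values, and the input file name are illustrative only, and the run assumes the pending-queue release path itself is enabled.

llc -mtriple=riscv64 -mattr=+v -O2 \
    -riscv-release-pending-queue-rvv-pressure-threshold=4 \
    -riscv-release-pending-queue-unit-inc-rvv-pressure-threshold=-2 \
    input.ll -o input.s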
>From 99a542846d1cb13bb31244043d04a5d676cb336e Mon Sep 17 00:00:00 2001
From: Piyou Chen <piyou.chen at sifive.com>
Date: Sun, 9 Feb 2025 20:50:16 -0800
Subject: [PATCH 11/11] fixup! Use RISCV::RegisterPressureSets idx directly
---
llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index b97071f1ba5fcb3..99292bfc7f5fda2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -967,6 +967,13 @@ bool RISCVRegisterInfo::getRegAllocationHints(
return BaseImplRetVal;
}
+static bool isRVVPressureSetIdx(unsigned Idx) {
+ static std::vector<unsigned> RVVPressureSetIndices = {
+ RISCV::RegisterPressureSets::VRM8NoV0, RISCV::RegisterPressureSets::VM};
+
+ return llvm::find(RVVPressureSetIndices, Idx) != RVVPressureSetIndices.end();
+}
+
bool RISCVRegisterInfo::shouldReleasePendingQueue(
MachineFunction &MF, ArrayRef<unsigned> MaxSetPressure) const {
@@ -976,8 +983,7 @@ bool RISCVRegisterInfo::shouldReleasePendingQueue(
for (unsigned Idx = 0; Idx < MaxSetPressure.size(); Idx++) {
// Consider only the RVV Register, as RVV spilling/reloading has higher
// potential costs than hazards.
- if (!StringRef(getRegPressureSetName(Idx)).starts_with("VM") &&
- !StringRef(getRegPressureSetName(Idx)).starts_with("VRM8NoV0"))
+ if (!isRVVPressureSetIdx(Idx))
continue;
if (MaxSetPressure[Idx] + RVVReleasePendingPressureThreshold >
getRegPressureSetLimit(MF, Idx)) {
@@ -997,8 +1003,7 @@ bool RISCVRegisterInfo::shouldReleaseSUFromPendingQueue(
for (unsigned Idx = 0; Idx < PSetID.size(); Idx++) {
// Consider only the RVV Register, as RVV spilling/reloading has higher
// potential costs than hazards.
- if (!StringRef(getRegPressureSetName(PSetID[Idx])).starts_with("VM") &&
- !StringRef(getRegPressureSetName(Idx)).starts_with("VRM8NoV0"))
+ if (!isRVVPressureSetIdx(PSetID[Idx]))
continue;
if (UnitInc[Idx] < UnitIncRVVRegPressureThreshold)
return true;