[llvm] 76cb984 - [RISCV] Sources of vmerge shouldn't overlap V0 (#170070)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 18:55:42 PST 2025
Author: Pengcheng Wang
Date: 2025-12-03T10:55:38+08:00
New Revision: 76cb98442b280bf9b5862f0bec3a56c2cc37d70f
URL: https://github.com/llvm/llvm-project/commit/76cb98442b280bf9b5862f0bec3a56c2cc37d70f
DIFF: https://github.com/llvm/llvm-project/commit/76cb98442b280bf9b5862f0bec3a56c2cc37d70f.diff
LOG: [RISCV] Sources of vmerge shouldn't overlap V0 (#170070)
According to the spec:
> A vector register cannot be used to provide source operands with more
> than one EEW for a single instruction. A mask register source is
> considered to have EEW=1 for this constraint.
The `vmerge` variants always take their mask in `V0`, so their other sources must use register classes that exclude `V0`.
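
For illustration, a minimal hand-written sketch of what the constraint forbids (register numbers are hypothetical, not taken from the failing test):

```
# v0 holds the mask, so it already provides a source with EEW=1.
vmerge.vvm v8, v0, v9, v0     # illegal: v0 would also be read as vs2 with EEW=SEW
vmerge.vvm v8, v10, v9, v0    # fine: mask in v0, vector sources in other registers
```

Constraining the pseudo's source operands to the `*NoV0` register classes keeps the register allocator from ever producing the first form.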
This fixes #169905.
Co-authored-by: Luke Lau <luke at igalia.com>
Added:
Modified:
llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
llvm/test/CodeGen/RISCV/rvv/copyprop.mir
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
llvm/test/CodeGen/RISCV/rvv/pr88576.ll
llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
llvm/test/CodeGen/RISCV/rvv/vl-optimizer-subreg-assert.mir
llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir
llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index eb3c9b0defccb..e36204c536c0d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2982,21 +2982,21 @@ multiclass VPseudoVFWALU_WV_WF_RM {
multiclass VPseudoVMRG_VM_XM_IM {
foreach m = MxList in {
defvar mx = m.MX;
- def "_VVM" # "_" # m.MX:
- VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, m.vrclass, m>,
- SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx,
- forcePassthruRead=true>;
- def "_VXM" # "_" # m.MX:
- VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, GPR, m>,
- SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx,
- forcePassthruRead=true>;
- def "_VIM" # "_" # m.MX:
- VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, simm5, m>,
- SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx,
- forcePassthruRead=true>;
+ def "_VVM"#"_"#m.MX : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+ GetVRegNoV0<m.vrclass>.R,
+ GetVRegNoV0<m.vrclass>.R, m>,
+ SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx,
+ forcePassthruRead = true>;
+ def "_VXM"#"_"#m.MX
+ : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+ GetVRegNoV0<m.vrclass>.R, GPR, m>,
+ SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx,
+ forcePassthruRead = true>;
+ def "_VIM"#"_"#m.MX
+ : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+ GetVRegNoV0<m.vrclass>.R, simm5, m>,
+ SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx,
+ forcePassthruRead = true>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index e1ff243bb1a47..5acb7f5bcd56a 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -73,7 +73,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {
bool isAllOnesMask(const MachineInstr *MaskDef) const;
std::optional<unsigned> getConstant(const MachineOperand &VL) const;
bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const;
- bool isKnownSameDefs(Register A, Register B) const;
+ Register lookThruCopies(Register Reg) const;
};
} // namespace
@@ -387,23 +387,18 @@ bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const {
return true;
}
-bool RISCVVectorPeephole::isKnownSameDefs(Register A, Register B) const {
- if (A.isPhysical() || B.isPhysical())
- return false;
-
- auto LookThruVirtRegCopies = [this](Register Reg) {
- while (MachineInstr *Def = MRI->getUniqueVRegDef(Reg)) {
- if (!Def->isFullCopy())
- break;
- Register Src = Def->getOperand(1).getReg();
- if (!Src.isVirtual())
- break;
- Reg = Src;
- }
- return Reg;
- };
-
- return LookThruVirtRegCopies(A) == LookThruVirtRegCopies(B);
+// If \p Reg is defined by one or more COPYs of virtual registers, traverses
+// the chain and returns the root non-COPY source.
+Register RISCVVectorPeephole::lookThruCopies(Register Reg) const {
+ while (MachineInstr *Def = MRI->getUniqueVRegDef(Reg)) {
+ if (!Def->isFullCopy())
+ break;
+ Register Src = Def->getOperand(1).getReg();
+ if (!Src.isVirtual())
+ break;
+ Reg = Src;
+ }
+ return Reg;
}
/// If a PseudoVMERGE_VVM's true operand is a masked pseudo and both have the
@@ -428,10 +423,11 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
if (!TrueMaskedInfo || !hasSameEEW(MI, *True))
return false;
- const MachineOperand &TrueMask =
- True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs());
- const MachineOperand &MIMask = MI.getOperand(4);
- if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg()))
+ Register TrueMaskReg = lookThruCopies(
+ True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs())
+ .getReg());
+ Register MIMaskReg = lookThruCopies(MI.getOperand(4).getReg());
+ if (!TrueMaskReg.isVirtual() || TrueMaskReg != MIMaskReg)
return false;
// Masked off lanes past TrueVL will come from False, and converting to vmv
@@ -717,9 +713,9 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VMERGE_VVM)
return false;
- Register PassthruReg = MI.getOperand(1).getReg();
- Register FalseReg = MI.getOperand(2).getReg();
- Register TrueReg = MI.getOperand(3).getReg();
+ Register PassthruReg = lookThruCopies(MI.getOperand(1).getReg());
+ Register FalseReg = lookThruCopies(MI.getOperand(2).getReg());
+ Register TrueReg = lookThruCopies(MI.getOperand(3).getReg());
if (!TrueReg.isVirtual() || !MRI->hasOneUse(TrueReg))
return false;
MachineInstr &True = *MRI->getUniqueVRegDef(TrueReg);
@@ -740,16 +736,17 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
// We require that either passthru and false are the same, or that passthru
// is undefined.
- if (PassthruReg && !isKnownSameDefs(PassthruReg, FalseReg))
+ if (PassthruReg && !(PassthruReg.isVirtual() && PassthruReg == FalseReg))
return false;
std::optional<std::pair<unsigned, unsigned>> NeedsCommute;
// If True has a passthru operand then it needs to be the same as vmerge's
// False, since False will be used for the result's passthru operand.
- Register TruePassthru = True.getOperand(True.getNumExplicitDefs()).getReg();
+ Register TruePassthru =
+ lookThruCopies(True.getOperand(True.getNumExplicitDefs()).getReg());
if (RISCVII::isFirstDefTiedToFirstUse(True.getDesc()) && TruePassthru &&
- !isKnownSameDefs(TruePassthru, FalseReg)) {
+ !(TruePassthru.isVirtual() && TruePassthru == FalseReg)) {
// If True's passthru != False, check if it uses False in another operand
// and try to commute it.
int OtherIdx = True.findRegisterUseOperandIdx(FalseReg, TRI);
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
index f8061462c6220..ada76a43639d7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
@@ -11,7 +11,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv1i8
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -19,7 +19,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv1i8
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -40,7 +40,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv4i8
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -48,7 +48,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv4i8
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -69,7 +69,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv16i8
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
@@ -77,7 +77,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv16i8
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
@@ -98,7 +98,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv64i8
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -106,7 +106,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv64i8
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -127,7 +127,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv2i16
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -135,7 +135,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv2i16
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -156,7 +156,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv8i16
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
@@ -164,7 +164,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv8i16
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]]
@@ -185,7 +185,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv32i16
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]]
@@ -193,7 +193,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv32i16
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]]
@@ -214,7 +214,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv2i32
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
@@ -222,7 +222,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv2i32
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
@@ -243,7 +243,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv8i32
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
@@ -251,7 +251,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv8i32
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
@@ -272,7 +272,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv1i64
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
@@ -280,7 +280,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv1i64
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]]
@@ -301,7 +301,7 @@ body: |
bb.0.entry:
; RV32I-LABEL: name: select_nxv4i64
; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
@@ -309,7 +309,7 @@ body: |
;
; RV64I-LABEL: name: select_nxv4i64
; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */
; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]]
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
index 2d4fce68f9545..96252f070a580 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
@@ -311,10 +311,10 @@ define i32 @test_nxv128i1(<vscale x 128 x i1> %x) {
; CHECK-NEXT: vmerge.vim v16, v16, 1, v0
; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v6, a0
+; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v7, a1
-; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v7, a0
; CHECK-NEXT: vslidedown.vx v5, v6, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
index 31e79e58f44c5..aba75ffe29d33 100644
--- a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
@@ -43,7 +43,7 @@ body: |
%2:gpr = COPY $x11
%1:gpr = COPY $x10
%3:vr = COPY $v8
- %17:vr = PseudoVSLL_VI_M1 undef $noreg, %3, 5, 1, 6 /* e64 */, 0
+ %17:vrnov0 = PseudoVSLL_VI_M1 undef $noreg, %3, 5, 1, 6 /* e64 */, 0
%22:vr = PseudoVMSNE_VI_M1 %3, 0, 1, 6 /* e64 */
%23:vmv0 = COPY %22
%25:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, %17, -1, %23, 1, 6 /* e64 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 5567310bb2a61..9b35860904f11 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -530,290 +530,267 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 100
+; RV32-NEXT: li a3, 84
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 100 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb
; RV32-NEXT: addi a4, a1, 128
; RV32-NEXT: addi a5, a1, 256
; RV32-NEXT: li a2, 32
; RV32-NEXT: lui a3, 12
-; RV32-NEXT: lui a6, 12291
-; RV32-NEXT: lui a7, %hi(.LCPI27_0)
-; RV32-NEXT: addi a7, a7, %lo(.LCPI27_0)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vle32.v v24, (a5)
-; RV32-NEXT: vmv.s.x v0, a3
+; RV32-NEXT: lui a5, 12291
+; RV32-NEXT: vmv.s.x v3, a3
; RV32-NEXT: vle32.v v8, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: li a6, 76
+; RV32-NEXT: mul a1, a1, a6
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a6, a6, 3
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT: vslideup.vi v16, v24, 4
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a5, 76
-; RV32-NEXT: mul a1, a1, a5
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a5, 92
-; RV32-NEXT: mul a1, a1, a5
+; RV32-NEXT: li a6, 60
+; RV32-NEXT: mul a1, a1, a6
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vmv1r.v v30, v0
+; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; RV32-NEXT: vslideup.vi v16, v8, 10, v0.t
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a5, 72
-; RV32-NEXT: mul a1, a1, a5
+; RV32-NEXT: li a6, 56
+; RV32-NEXT: mul a1, a1, a6
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vle32.v v8, (a4)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 84
+; RV32-NEXT: li a4, 68
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: addi a5, a5, 3
+; RV32-NEXT: vmv.s.x v0, a5
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vle16.v v28, (a7)
-; RV32-NEXT: vmv.s.x v0, a6
+; RV32-NEXT: vslideup.vi v28, v24, 2
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: li a4, 76
+; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 84
+; RV32-NEXT: li a4, 68
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v16, v8, v16, v0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v0, v16, v28
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 52
+; RV32-NEXT: li a4, 44
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; RV32-NEXT: vslideup.vi v8, v24, 2
-; RV32-NEXT: vmv1r.v v0, v30
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 92
+; RV32-NEXT: li a4, 60
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vslideup.vi v8, v16, 8, v0.t
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT: vslideup.vi v28, v8, 8, v0.t
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 60
+; RV32-NEXT: li a4, 52
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV32-NEXT: lui a7, 49164
-; RV32-NEXT: lui a1, %hi(.LCPI27_1)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_1)
-; RV32-NEXT: lui t2, 3
-; RV32-NEXT: lui t1, 196656
-; RV32-NEXT: lui a4, %hi(.LCPI27_3)
-; RV32-NEXT: addi a4, a4, %lo(.LCPI27_3)
-; RV32-NEXT: lui t0, 786624
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: lui a6, 768
-; RV32-NEXT: addi a7, a7, 12
-; RV32-NEXT: vmv.s.x v0, a7
-; RV32-NEXT: addi t2, t2, 3
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: li t3, 84
-; RV32-NEXT: mul a7, a7, t3
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vl8r.v v16, (a7) # vscale x 64-byte Folded Reload
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: slli a7, a7, 6
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v16, v8, v0
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: slli a7, a7, 5
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vmv.s.x v0, t2
-; RV32-NEXT: addi a7, t1, 48
-; RV32-NEXT: csrr t1, vlenb
-; RV32-NEXT: li t2, 92
-; RV32-NEXT: mul t1, t1, t2
-; RV32-NEXT: add t1, sp, t1
-; RV32-NEXT: addi t1, t1, 16
-; RV32-NEXT: vl8r.v v24, (t1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: csrr t1, vlenb
+; RV32-NEXT: vs4r.v v28, (a1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: lui a1, %hi(.LCPI27_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI27_0)
+; RV32-NEXT: lui a6, 49164
+; RV32-NEXT: lui t1, 3
+; RV32-NEXT: lui t0, 196656
+; RV32-NEXT: lui a7, 786624
+; RV32-NEXT: li a4, 48
+; RV32-NEXT: lui a5, 768
+; RV32-NEXT: addi a6, a6, 12
+; RV32-NEXT: vmv.s.x v0, a6
+; RV32-NEXT: addi t1, t1, 3
+; RV32-NEXT: csrr a6, vlenb
; RV32-NEXT: li t2, 76
-; RV32-NEXT: mul t1, t1, t2
-; RV32-NEXT: add t1, sp, t1
-; RV32-NEXT: addi t1, t1, 16
-; RV32-NEXT: vl8r.v v8, (t1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: mul a6, a6, t2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vl8r.v v16, (a6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li t2, 68
+; RV32-NEXT: mul a6, a6, t2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li t2, 28
+; RV32-NEXT: mul a6, a6, t2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vs8r.v v8, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmv.s.x v0, t1
+; RV32-NEXT: addi a6, t0, 48
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: li t1, 60
+; RV32-NEXT: mul t0, t0, t1
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 16
+; RV32-NEXT: vl8r.v v8, (t0) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
-; RV32-NEXT: csrr t1, vlenb
-; RV32-NEXT: li t2, 44
-; RV32-NEXT: mul t1, t1, t2
-; RV32-NEXT: add t1, sp, t1
-; RV32-NEXT: addi t1, t1, 16
-; RV32-NEXT: vs4r.v v8, (t1) # vscale x 32-byte Folded Spill
-; RV32-NEXT: vmv.s.x v0, a7
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: li t1, 36
+; RV32-NEXT: mul t0, t0, t1
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 16
+; RV32-NEXT: vs4r.v v8, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vmv.s.x v0, a6
; RV32-NEXT: addi a3, a3, 12
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: slli a7, a7, 6
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v16, v24, v0
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: slli a7, a7, 4
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vmv8r.v v16, v24
-; RV32-NEXT: vmv.s.x v0, a3
-; RV32-NEXT: addi a3, t0, 192
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: li t0, 92
-; RV32-NEXT: mul a7, a7, t0
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
-; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: csrr a6, vlenb
; RV32-NEXT: li t0, 76
-; RV32-NEXT: mul a7, a7, t0
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: li t0, 48
-; RV32-NEXT: mul a7, a7, t0
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vs4r.v v8, (a7) # vscale x 32-byte Folded Spill
-; RV32-NEXT: vmv.s.x v0, a3
-; RV32-NEXT: li a3, 192
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: li t0, 84
-; RV32-NEXT: mul a7, a7, t0
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload
+; RV32-NEXT: mul a6, a6, t0
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vl8r.v v16, (a6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li t0, 68
+; RV32-NEXT: mul a6, a6, t0
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV32-NEXT: csrr a7, vlenb
-; RV32-NEXT: li t0, 24
-; RV32-NEXT: mul a7, a7, t0
-; RV32-NEXT: add a7, sp, a7
-; RV32-NEXT: addi a7, a7, 16
-; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vmv.s.x v0, a5
-; RV32-NEXT: addi a5, a6, 768
; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: li a7, 92
-; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: li t0, 20
+; RV32-NEXT: mul a6, a6, t0
; RV32-NEXT: add a6, sp, a6
; RV32-NEXT: addi a6, a6, 16
-; RV32-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vs8r.v v8, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmv.s.x v0, a3
+; RV32-NEXT: addi a3, a7, 192
; RV32-NEXT: csrr a6, vlenb
-; RV32-NEXT: li a7, 76
+; RV32-NEXT: li a7, 60
; RV32-NEXT: mul a6, a6, a7
; RV32-NEXT: add a6, sp, a6
; RV32-NEXT: addi a6, a6, 16
; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
; RV32-NEXT: csrr a6, vlenb
; RV32-NEXT: li a7, 40
; RV32-NEXT: mul a6, a6, a7
; RV32-NEXT: add a6, sp, a6
; RV32-NEXT: addi a6, a6, 16
; RV32-NEXT: vs4r.v v8, (a6) # vscale x 32-byte Folded Spill
-; RV32-NEXT: vmv.s.x v0, a5
-; RV32-NEXT: vle16.v v6, (a1)
-; RV32-NEXT: vle16.v v2, (a4)
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a4, 84
-; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmv.s.x v0, a3
+; RV32-NEXT: li a3, 192
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 76
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vl8r.v v16, (a6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 68
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vs8r.v v8, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmv.s.x v0, a4
+; RV32-NEXT: addi a4, a5, 768
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li a6, 60
+; RV32-NEXT: mul a5, a5, a6
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 4
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs4r.v v8, (a5) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vmv.s.x v0, a4
+; RV32-NEXT: vle16.v v2, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: li a4, 76
+; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vmv.s.x v0, a3
+; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: li a4, 68
+; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v24, v8, v6
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 92
-; RV32-NEXT: mul a1, a1, a3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmv.s.x v0, a3
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 76
+; RV32-NEXT: li a3, 60
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v24, v8, v0
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 92
+; RV32-NEXT: li a3, 60
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: li a3, 44
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v24, v8, v2
-; RV32-NEXT: lui a1, %hi(.LCPI27_2)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_2)
+; RV32-NEXT: lui a1, %hi(.LCPI27_1)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI27_1)
; RV32-NEXT: lui a3, 3073
; RV32-NEXT: addi a3, a3, -1024
; RV32-NEXT: vmv.s.x v0, a3
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vle16.v v3, (a1)
+; RV32-NEXT: vle16.v v30, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a3, 84
+; RV32-NEXT: li a3, 76
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a3, 68
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
@@ -821,179 +798,191 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 72
+; RV32-NEXT: li a2, 76
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v28, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: lui a1, %hi(.LCPI27_3)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI27_3)
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vle16.v v28, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 52
+; RV32-NEXT: li a2, 28
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v28, v16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vrgatherei16.vv v16, v8, v30
+; RV32-NEXT: lui a1, %hi(.LCPI27_2)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI27_2)
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vle16.v v20, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 72
+; RV32-NEXT: li a2, 20
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v28, (a1) # vscale x 32-byte Folded Spill
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v8, v0, v28
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 60
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload
-; RV32-NEXT: vmv.v.v v20, v16
+; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v12, v24
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 60
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 44
+; RV32-NEXT: li a2, 52
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v20, v16, v3
-; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v20, v24
+; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmv.v.v v12, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a2, 52
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: li a2, 36
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs4r.v v20, (a1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v24, v12, v20
+; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v24, v8
; RV32-NEXT: lui a1, %hi(.LCPI27_4)
; RV32-NEXT: addi a1, a1, %lo(.LCPI27_4)
; RV32-NEXT: lui a2, %hi(.LCPI27_5)
; RV32-NEXT: addi a2, a2, %lo(.LCPI27_5)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v24, (a2)
+; RV32-NEXT: vle16.v v28, (a2)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vle16.v v16, (a1)
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 84
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs1r.v v16, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vle16.v v1, (a1)
; RV32-NEXT: lui a1, %hi(.LCPI27_7)
; RV32-NEXT: addi a1, a1, %lo(.LCPI27_7)
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle16.v v16, (a1)
+; RV32-NEXT: vle16.v v2, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 76
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vrgatherei16.vv v8, v16, v28
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
+; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vrgatherei16.vv v16, v0, v24
+; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vrgatherei16.vv v28, v12, v1
+; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT: vmv.v.v v28, v8
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vrgatherei16.vv v16, v8, v2
+; RV32-NEXT: lui a1, %hi(.LCPI27_6)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI27_6)
+; RV32-NEXT: lui a2, %hi(.LCPI27_8)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI27_8)
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vle16.v v8, (a1)
+; RV32-NEXT: lui a1, %hi(.LCPI27_9)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI27_9)
+; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT: vle16.v v10, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 48
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: li a3, 44
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vs2r.v v10, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vle16.v v9, (a2)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 84
+; RV32-NEXT: li a2, 68
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl1r.v v7, (a1) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v24, v20, v7
+; RV32-NEXT: vs1r.v v9, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vrgatherei16.vv v20, v12, v8
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v24, v16
+; RV32-NEXT: vmv.v.v v20, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: li a2, 76
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 76
+; RV32-NEXT: li a2, 44
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl2r.v v28, (a1) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v16, v0, v28
-; RV32-NEXT: lui a1, %hi(.LCPI27_6)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_6)
-; RV32-NEXT: lui a2, %hi(.LCPI27_8)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI27_8)
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vle16.v v4, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI27_9)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI27_9)
-; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v6, (a1)
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vle16.v v5, (a2)
+; RV32-NEXT: vrgatherei16.vv v8, v0, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
+; RV32-NEXT: li a2, 60
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload
-; RV32-NEXT: vrgatherei16.vv v0, v20, v4
-; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v0, v16
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vrgatherei16.vv v16, v8, v6
+; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 92
+; RV32-NEXT: li a2, 68
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vl1r.v v7, (a1) # vscale x 8-byte Folded Reload
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vrgatherei16.vv v8, v12, v5
+; RV32-NEXT: vrgatherei16.vv v12, v16, v7
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT: vmv.v.v v8, v16
+; RV32-NEXT: vmv.v.v v12, v8
; RV32-NEXT: addi a1, a0, 320
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vse32.v v8, (a1)
+; RV32-NEXT: vse32.v v12, (a1)
; RV32-NEXT: addi a1, a0, 256
-; RV32-NEXT: vse32.v v0, (a1)
+; RV32-NEXT: vse32.v v20, (a1)
; RV32-NEXT: addi a1, a0, 192
-; RV32-NEXT: vse32.v v24, (a1)
+; RV32-NEXT: vse32.v v28, (a1)
; RV32-NEXT: addi a1, a0, 128
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 6
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload
-; RV32-NEXT: vse32.v v8, (a1)
+; RV32-NEXT: vse32.v v24, (a1)
; RV32-NEXT: addi a1, a0, 64
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 60
+; RV32-NEXT: li a3, 52
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 72
+; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
; RV32-NEXT: vse32.v v8, (a0)
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 100
+; RV32-NEXT: li a1, 84
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
@@ -1013,60 +1002,48 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a1)
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 53
+; RV64-NEXT: li a3, 85
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
; RV64-NEXT: addi a2, a1, 128
-; RV64-NEXT: addi a3, a1, 256
-; RV64-NEXT: li a4, 128
+; RV64-NEXT: addi a1, a1, 256
+; RV64-NEXT: vle64.v v8, (a1)
+; RV64-NEXT: li a3, 128
; RV64-NEXT: lui a1, 1
-; RV64-NEXT: vle64.v v8, (a3)
-; RV64-NEXT: lui a3, %hi(.LCPI27_0)
-; RV64-NEXT: addi a3, a3, %lo(.LCPI27_0)
-; RV64-NEXT: vmv.s.x v0, a4
-; RV64-NEXT: csrr a4, vlenb
-; RV64-NEXT: li a5, 61
-; RV64-NEXT: mul a4, a4, a5
-; RV64-NEXT: add a4, sp, a4
-; RV64-NEXT: addi a4, a4, 16
-; RV64-NEXT: vs1r.v v0, (a4) # vscale x 8-byte Folded Spill
-; RV64-NEXT: addi a4, a1, 65
+; RV64-NEXT: vmv.s.x v3, a3
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vslideup.vi v24, v8, 2
; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma
; RV64-NEXT: vslidedown.vi v16, v8, 8
-; RV64-NEXT: csrr a5, vlenb
-; RV64-NEXT: li a6, 77
-; RV64-NEXT: mul a5, a5, a6
-; RV64-NEXT: add a5, sp, a5
-; RV64-NEXT: addi a5, a5, 16
-; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT: csrr a5, vlenb
-; RV64-NEXT: li a6, 77
-; RV64-NEXT: mul a5, a5, a6
-; RV64-NEXT: add a5, sp, a5
-; RV64-NEXT: addi a5, a5, 16
-; RV64-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 45
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV64-NEXT: vslideup.vi v24, v16, 5, v0.t
-; RV64-NEXT: csrr a5, vlenb
-; RV64-NEXT: li a6, 73
-; RV64-NEXT: mul a5, a5, a6
-; RV64-NEXT: add a5, sp, a5
-; RV64-NEXT: addi a5, a5, 16
-; RV64-NEXT: vs4r.v v24, (a5) # vscale x 32-byte Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a4, 73
+; RV64-NEXT: mul a3, a3, a4
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vs4r.v v24, (a3) # vscale x 32-byte Folded Spill
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v24, (a2)
+; RV64-NEXT: vle64.v v16, (a2)
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a5, 85
-; RV64-NEXT: mul a2, a2, a5
+; RV64-NEXT: li a3, 77
+; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vle16.v v12, (a3)
-; RV64-NEXT: vmv.s.x v0, a4
+; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: addi a2, a1, 65
+; RV64-NEXT: vmv.s.x v0, a2
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vslideup.vi v12, v8, 1
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: li a3, 85
; RV64-NEXT: mul a2, a2, a3
@@ -1074,35 +1051,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 53
+; RV64-NEXT: li a3, 77
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmerge.vvm v24, v24, v16, v0
-; RV64-NEXT: vrgatherei16.vv v0, v24, v12
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: li a3, 37
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v0, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vslideup.vi v12, v8, 1
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 61
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v7, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 77
+; RV64-NEXT: li a3, 45
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vslideup.vi v12, v24, 4, v0.t
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT: vslideup.vi v12, v16, 4, v0.t
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: li a3, 69
; RV64-NEXT: mul a2, a2, a3
@@ -1115,17 +1085,23 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a2, a2, 130
; RV64-NEXT: vmv.s.x v0, a2
; RV64-NEXT: addi a2, a3, 260
-; RV64-NEXT: vmv8r.v v24, v16
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: li a5, 85
; RV64-NEXT: mul a3, a3, a5
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 16
+; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: li a5, 77
+; RV64-NEXT: mul a3, a3, a5
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: addi a3, a3, 16
; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a3, a3, 3
+; RV64-NEXT: li a5, 21
+; RV64-NEXT: mul a3, a3, a5
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 16
; RV64-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
@@ -1137,6 +1113,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 77
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
; RV64-NEXT: csrr a2, vlenb
@@ -1147,21 +1129,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmv1r.v v0, v2
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 45
+; RV64-NEXT: li a3, 53
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV64-NEXT: vslideup.vi v12, v8, 5, v0.t
-; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 77
+; RV64-NEXT: li a3, 45
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vrgather.vi v12, v24, 4, v0.t
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vrgather.vi v12, v16, 4, v0.t
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a3, a2, 6
; RV64-NEXT: add a2, a3, a2
@@ -1171,84 +1153,65 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vslidedown.vi v12, v8, 1
; RV64-NEXT: vmv1r.v v0, v2
; RV64-NEXT: vslideup.vi v12, v8, 4, v0.t
-; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vrgather.vi v12, v24, 5, v0.t
+; RV64-NEXT: vmv1r.v v0, v3
+; RV64-NEXT: vmv4r.v v8, v16
+; RV64-NEXT: vrgather.vi v12, v16, 5, v0.t
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 25
-; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: slli a3, a2, 4
+; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vs4r.v v12, (a2) # vscale x 32-byte Folded Spill
; RV64-NEXT: lui a2, 8
; RV64-NEXT: addi a2, a2, 520
; RV64-NEXT: vmv.s.x v0, a2
-; RV64-NEXT: vslideup.vi v12, v24, 6
+; RV64-NEXT: vslideup.vi v4, v16, 6
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: li a3, 85
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 53
+; RV64-NEXT: li a3, 77
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a3, a2, 4
+; RV64-NEXT: slli a3, a2, 3
; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 77
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmv1r.v v0, v3
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t
-; RV64-NEXT: lui a2, %hi(.LCPI27_1)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI27_1)
-; RV64-NEXT: li a3, 192
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vle16.v v6, (a2)
-; RV64-NEXT: vmv.s.x v0, a3
+; RV64-NEXT: vslideup.vi v4, v8, 1, v0.t
+; RV64-NEXT: li a2, 192
+; RV64-NEXT: vmv.s.x v0, a2
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 4
+; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vs1r.v v0, (a2) # vscale x 8-byte Folded Spill
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 45
+; RV64-NEXT: li a3, 53
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vrgather.vi v28, v16, 2
-; RV64-NEXT: vmerge.vvm v16, v28, v12, v0
+; RV64-NEXT: vrgather.vi v12, v16, 2
+; RV64-NEXT: vmerge.vvm v12, v12, v4, v0
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: li a3, 61
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v24, v16, v6
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: lui a2, %hi(.LCPI27_2)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI27_2)
+; RV64-NEXT: vs4r.v v12, (a2) # vscale x 32-byte Folded Spill
+; RV64-NEXT: lui a2, %hi(.LCPI27_0)
+; RV64-NEXT: addi a2, a2, %lo(.LCPI27_0)
; RV64-NEXT: li a3, 1040
; RV64-NEXT: vmv.s.x v0, a3
; RV64-NEXT: addi a1, a1, -2016
@@ -1259,41 +1222,77 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a3, a3, 16
; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: li a4, 53
+; RV64-NEXT: li a4, 77
; RV64-NEXT: mul a3, a3, a4
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 16
; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmerge.vvm v8, v24, v16, v0
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a3, a3, 3
-; RV64-NEXT: add a3, sp, a3
-; RV64-NEXT: addi a3, a3, 16
-; RV64-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: vle16.v v6, (a2)
-; RV64-NEXT: li a1, 64
-; RV64-NEXT: vmerge.vvm v8, v24, v16, v0
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 85
-; RV64-NEXT: mul a2, a2, a3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: vle16.v v12, (a2)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 29
+; RV64-NEXT: li a2, 85
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 77
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vrgatherei16.vv v24, v16, v6
+; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 85
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: lui a1, %hi(.LCPI27_1)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI27_1)
+; RV64-NEXT: vle16.v v24, (a1)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 37
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vrgatherei16.vv v0, v16, v12
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 77
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: lui a1, %hi(.LCPI27_2)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI27_2)
+; RV64-NEXT: vle16.v v12, (a1)
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 21
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vrgatherei16.vv v0, v16, v24
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 37
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 29
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
+; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vrgatherei16.vv v24, v16, v12
; RV64-NEXT: vmv4r.v v28, v8
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV64-NEXT: vslideup.vi v28, v8, 5, v0.t
@@ -1304,13 +1303,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 37
+; RV64-NEXT: li a2, 77
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v8, v0
+; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 73
; RV64-NEXT: mul a1, a1, a2
@@ -1323,7 +1322,11 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
-; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a2, 37
+; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: csrr a1, vlenb
@@ -1335,62 +1338,59 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: lui a1, %hi(.LCPI27_3)
; RV64-NEXT: addi a1, a1, %lo(.LCPI27_3)
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vle16.v v20, (a1)
+; RV64-NEXT: vle16.v v8, (a1)
; RV64-NEXT: lui a1, %hi(.LCPI27_4)
; RV64-NEXT: addi a1, a1, %lo(.LCPI27_4)
-; RV64-NEXT: vle16.v v8, (a1)
+; RV64-NEXT: vle16.v v10, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 77
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
+; RV64-NEXT: vs2r.v v10, (a1) # vscale x 16-byte Folded Spill
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a2, a1, 6
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
+; RV64-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v8, v24
+; RV64-NEXT: vmv.v.v v12, v24
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a2, a1, 6
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; RV64-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: slli a2, a1, 3
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v0, v8, v20
+; RV64-NEXT: vrgatherei16.vv v0, v16, v8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 25
-; RV64-NEXT: mul a1, a1, a2
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload
+; RV64-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v12, v0
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmv.v.v v20, v0
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 77
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
+; RV64-NEXT: vl2r.v v16, (a1) # vscale x 16-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v0, v16, v8
+; RV64-NEXT: vrgatherei16.vv v0, v8, v16
; RV64-NEXT: lui a1, %hi(.LCPI27_5)
; RV64-NEXT: addi a1, a1, %lo(.LCPI27_5)
-; RV64-NEXT: vle16.v v20, (a1)
+; RV64-NEXT: vle16.v v12, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 61
; RV64-NEXT: mul a1, a1, a2
@@ -1406,7 +1406,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 45
+; RV64-NEXT: li a2, 53
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
@@ -1414,7 +1414,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vrgather.vi v8, v0, 3
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload
@@ -1426,7 +1426,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v24, v0, v20
+; RV64-NEXT: vrgatherei16.vv v24, v0, v12
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
; RV64-NEXT: vmv.v.v v8, v24
; RV64-NEXT: addi a1, a0, 320
@@ -1441,7 +1441,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload
; RV64-NEXT: vse64.v v8, (a1)
; RV64-NEXT: addi a1, a0, 192
-; RV64-NEXT: vse64.v v12, (a1)
+; RV64-NEXT: vse64.v v20, (a1)
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a3, a2, 6
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index ffbf1c7a548e1..1ccc52be36215 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1874,57 +1874,77 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: addi a2, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v24, (a2)
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vle32.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfmin.vv v24, v16, v24
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vfmin.vv v24, v8, v24
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vmfeq.vv v7, v8, v8
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v8, v16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v8
; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
@@ -1943,7 +1963,10 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: .LBB121_3:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -2257,56 +2280,76 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: addi a1, a0, 128
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: vle64.v v8, (a1)
; RV32-NEXT: addi a1, a0, 256
-; RV32-NEXT: vle64.v v8, (a0)
+; RV32-NEXT: vle64.v v16, (a0)
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmfeq.vv v0, v24, v24
-; RV32-NEXT: vmfeq.vv v7, v16, v16
-; RV32-NEXT: vmerge.vvm v8, v24, v16, v0
+; RV32-NEXT: vmfeq.vv v7, v8, v8
+; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vle64.v v8, (a1)
+; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmerge.vvm v16, v24, v8, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmv1r.v v0, v7
-; RV32-NEXT: vmerge.vvm v16, v16, v24, v0
-; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vfmin.vv v24, v16, v24
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vfmin.vv v24, v8, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmfeq.vv v0, v8, v8
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmfeq.vv v0, v16, v16
-; RV32-NEXT: vmfeq.vv v7, v8, v8
+; RV32-NEXT: vmfeq.vv v7, v16, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmerge.vvm v16, v16, v8, v0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmerge.vvm v16, v16, v8, v0
+; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vfmin.vv v16, v8, v16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vfmin.vv v16, v16, v8
; RV32-NEXT: vmfeq.vv v0, v16, v16
; RV32-NEXT: vmfeq.vv v7, v24, v24
; RV32-NEXT: vmerge.vvm v8, v16, v24, v0
@@ -2325,7 +2368,10 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: .LBB133_3:
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -2337,56 +2383,76 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v24, (a1)
; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: vle64.v v8, (a1)
; RV64-NEXT: addi a1, a0, 256
-; RV64-NEXT: vle64.v v8, (a0)
+; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmfeq.vv v0, v24, v24
-; RV64-NEXT: vmfeq.vv v7, v16, v16
-; RV64-NEXT: vmerge.vvm v8, v24, v16, v0
+; RV64-NEXT: vmfeq.vv v7, v8, v8
+; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vle64.v v8, (a1)
+; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vmerge.vvm v16, v24, v8, v0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
-; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vfmin.vv v24, v16, v24
+; RV64-NEXT: vmerge.vvm v8, v8, v24, v0
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vfmin.vv v24, v8, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmfeq.vv v0, v8, v8
+; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmfeq.vv v0, v16, v16
-; RV64-NEXT: vmfeq.vv v7, v8, v8
+; RV64-NEXT: vmfeq.vv v7, v16, v16
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmerge.vvm v16, v16, v8, v0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmv1r.v v0, v7
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmerge.vvm v16, v16, v8, v0
+; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vfmin.vv v16, v8, v16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vfmin.vv v16, v16, v8
; RV64-NEXT: vmfeq.vv v0, v16, v16
; RV64-NEXT: vmfeq.vv v7, v24, v24
; RV64-NEXT: vmerge.vvm v8, v16, v24, v0
@@ -2406,7 +2472,10 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; RV64-NEXT: vfmv.f.s fa0, v8
; RV64-NEXT: .LBB133_3:
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
; RV64-NEXT: addi sp, sp, 16
@@ -2702,57 +2771,77 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: addi a2, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v24, (a2)
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vle32.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfmax.vv v24, v16, v24
+; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vfmax.vv v24, v8, v24
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vmfeq.vv v7, v8, v8
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v8, v16
+; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v8
; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
@@ -2771,7 +2860,10 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: .LBB149_3:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -3085,56 +3177,76 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT: addi a1, a0, 128
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: addi a1, a0, 384
-; RV32-NEXT: vle64.v v16, (a1)
+; RV32-NEXT: vle64.v v8, (a1)
; RV32-NEXT: addi a1, a0, 256
-; RV32-NEXT: vle64.v v8, (a0)
+; RV32-NEXT: vle64.v v16, (a0)
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmfeq.vv v0, v24, v24
-; RV32-NEXT: vmfeq.vv v7, v16, v16
-; RV32-NEXT: vmerge.vvm v8, v24, v16, v0
+; RV32-NEXT: vmfeq.vv v7, v8, v8
+; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vle64.v v8, (a1)
+; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vmerge.vvm v16, v24, v8, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmv1r.v v0, v7
-; RV32-NEXT: vmerge.vvm v16, v16, v24, v0
-; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vfmax.vv v24, v16, v24
+; RV32-NEXT: vmerge.vvm v8, v8, v24, v0
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vfmax.vv v24, v8, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmfeq.vv v0, v8, v8
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmfeq.vv v0, v16, v16
-; RV32-NEXT: vmfeq.vv v7, v8, v8
+; RV32-NEXT: vmfeq.vv v7, v16, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmerge.vvm v16, v16, v8, v0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmerge.vvm v16, v16, v8, v0
+; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vfmax.vv v16, v8, v16
+; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vfmax.vv v16, v16, v8
; RV32-NEXT: vmfeq.vv v0, v16, v16
; RV32-NEXT: vmfeq.vv v7, v24, v24
; RV32-NEXT: vmerge.vvm v8, v16, v24, v0
@@ -3153,7 +3265,10 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: .LBB161_3:
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -3165,56 +3280,76 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v24, (a1)
; RV64-NEXT: addi a1, a0, 384
-; RV64-NEXT: vle64.v v16, (a1)
+; RV64-NEXT: vle64.v v8, (a1)
; RV64-NEXT: addi a1, a0, 256
-; RV64-NEXT: vle64.v v8, (a0)
+; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmfeq.vv v0, v24, v24
-; RV64-NEXT: vmfeq.vv v7, v16, v16
-; RV64-NEXT: vmerge.vvm v8, v24, v16, v0
+; RV64-NEXT: vmfeq.vv v7, v8, v8
+; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vle64.v v8, (a1)
+; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vmerge.vvm v16, v24, v8, v0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vmerge.vvm v16, v16, v24, v0
-; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vfmax.vv v24, v16, v24
+; RV64-NEXT: vmerge.vvm v8, v8, v24, v0
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vfmax.vv v24, v8, v24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmfeq.vv v0, v8, v8
+; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmfeq.vv v0, v16, v16
-; RV64-NEXT: vmfeq.vv v7, v8, v8
+; RV64-NEXT: vmfeq.vv v7, v16, v16
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmerge.vvm v16, v16, v8, v0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; RV64-NEXT: vmv1r.v v0, v7
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vmerge.vvm v16, v16, v8, v0
+; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vfmax.vv v16, v8, v16
+; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vfmax.vv v16, v16, v8
; RV64-NEXT: vmfeq.vv v0, v16, v16
; RV64-NEXT: vmfeq.vv v7, v24, v24
; RV64-NEXT: vmerge.vvm v8, v16, v24, v0
@@ -3234,7 +3369,10 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; RV64-NEXT: vfmv.f.s fa0, v8
; RV64-NEXT: .LBB161_3:
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
; RV64-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index 25a4eb74eeba7..fd70f95ed53c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -121,155 +121,68 @@ define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
}
define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; ZVFH-LABEL: vfmax_nxv32bf16_vv:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: sub sp, sp, a0
-; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vmv8r.v v24, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vmv8r.v v0, v8
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v0
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v3, v16, v16
-; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vmv1r.v v0, v3
-; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v4
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfmax.vv v16, v0, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v7, v24, v24
-; ZVFH-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFH-NEXT: vmv1r.v v0, v7
-; ZVFH-NEXT: vmerge.vvm v8, v24, v8, v0
-; ZVFH-NEXT: vfmax.vv v16, v8, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmax_nxv32bf16_vv:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v0, v8
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v4
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmax.vv v16, v0, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: ret
+; CHECK-LABEL: vfmax_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv8r.v v0, v16
+; CHECK-NEXT: vmv8r.v v24, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v3, v16, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmax.vv v16, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
ret <vscale x 32 x bfloat> %v
}
@@ -444,54 +357,45 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: sub sp, sp, a0
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v0, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT: vmv8r.v v0, v16
+; ZVFHMIN-NEXT: vmv8r.v v24, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v3
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmax.vv v16, v0, v16
+; ZVFHMIN-NEXT: vfmax.vv v8, v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
@@ -499,7 +403,7 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
@@ -507,8 +411,7 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index 6ffa71c6c908b..339f97a73ee52 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -121,155 +121,68 @@ define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
}
define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; ZVFH-LABEL: vfmin_nxv32bf16_vv:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: sub sp, sp, a0
-; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vmv8r.v v24, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vmv8r.v v0, v8
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v0
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v3, v16, v16
-; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vmv1r.v v0, v3
-; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v4
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfmin.vv v16, v0, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v7, v24, v24
-; ZVFH-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFH-NEXT: vmv1r.v v0, v7
-; ZVFH-NEXT: vmerge.vvm v8, v24, v8, v0
-; ZVFH-NEXT: vfmin.vv v16, v8, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmin_nxv32bf16_vv:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v0, v8
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v4
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmin.vv v16, v0, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: ret
+; CHECK-LABEL: vfmin_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv8r.v v0, v16
+; CHECK-NEXT: vmv8r.v v24, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v3, v16, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmin.vv v16, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
ret <vscale x 32 x bfloat> %v
}
@@ -444,54 +357,45 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: sub sp, sp, a0
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v0, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT: vmv8r.v v0, v16
+; ZVFHMIN-NEXT: vmv8r.v v24, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v3
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmin.vv v16, v0, v16
+; ZVFHMIN-NEXT: vfmin.vv v8, v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
@@ -499,7 +403,7 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
@@ -507,8 +411,7 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
index a967f86f5b930..ba5a05e89c506 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir
@@ -24,8 +24,8 @@ body: |
; CHECK-NEXT: PseudoRET implicit $v0
%0:vr = COPY $v0
%1:vr = COPY $v1
- %2:vr = COPY $v2
- %3:vr = COPY $v3
+ %2:vrnov0 = COPY $v2
+ %3:vrnov0 = COPY $v3
%4:vmv0 = COPY %0
%5:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, killed %2, 1, %4, 1, 3
%6:vmv0 = COPY %1
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
index dd7debd3ab046..a094e35ffde97 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
@@ -31,9 +31,9 @@ define i1 @foo(<vscale x 16 x i8> %x, i64 %y) {
; CHECK-NEXT: add a0, a2, a0
; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vmerge.vim v24, v16, 1, v0
-; CHECK-NEXT: vs8r.v v24, (a1)
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT: vs8r.v v24, (a1)
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: addi sp, s0, -80
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index a7eaf39793236..c73c2004834db 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -10,13 +10,13 @@ body: |
; CHECK-LABEL: name: undef_passthru
; CHECK: liveins: $x1, $v8, $v9
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %false:vr = COPY $v8
- ; CHECK-NEXT: %true:vr = COPY $v9
+ ; CHECK-NEXT: %false:vrnov0 = COPY $v8
+ ; CHECK-NEXT: %true:vrnov0 = COPY $v9
; CHECK-NEXT: %avl:gprnox0 = COPY $x1
; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */
; CHECK-NEXT: $v0 = COPY %mask
- %false:vr = COPY $v8
- %true:vr = COPY $v9
+ %false:vrnov0 = COPY $v8
+ %true:vrnov0 = COPY $v9
%avl:gprnox0 = COPY $x1
%mask:vmv0 = PseudoVMSET_M_B8 %avl, 0
$v0 = COPY %mask
@@ -31,15 +31,15 @@ body: |
; CHECK: liveins: $x1, $v8, $v9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %pt:vr = COPY $v8
- ; CHECK-NEXT: %false:vr = COPY $noreg
- ; CHECK-NEXT: %true:vr = COPY $v9
+ ; CHECK-NEXT: %false:vrnov0 = COPY $noreg
+ ; CHECK-NEXT: %true:vrnov0 = COPY $v9
; CHECK-NEXT: %avl:gprnox0 = COPY $x1
; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */
; CHECK-NEXT: $v0 = COPY %mask
; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, %avl, 5 /* e32 */, 0 /* tu, mu */
%pt:vrnov0 = COPY $v8
- %false:vr = COPY $noreg
- %true:vr = COPY $v9
+ %false:vrnov0 = COPY $noreg
+ %true:vrnov0 = COPY $v9
%avl:gprnox0 = COPY $x1
%mask:vmv0 = PseudoVMSET_M_B8 %avl, 0
$v0 = COPY %mask
@@ -53,15 +53,15 @@ body: |
; CHECK-LABEL: name: equal_passthru_false
; CHECK: liveins: $x1, $v8, $v9
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %false:vr = COPY $v8
+ ; CHECK-NEXT: %false:vrnov0 = COPY $v8
; CHECK-NEXT: %pt:vr = COPY $v8
- ; CHECK-NEXT: %true:vr = COPY $v9
+ ; CHECK-NEXT: %true:vrnov0 = COPY $v9
; CHECK-NEXT: %avl:gprnox0 = COPY $x1
; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */
; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, %avl, 5 /* e32 */, 0 /* tu, mu */
- %false:vr = COPY $v8
+ %false:vrnov0 = COPY $v8
%pt:vrnov0 = COPY $v8
- %true:vr = COPY $v9
+ %true:vrnov0 = COPY $v9
%avl:gprnox0 = COPY $x1
%mask:vmv0 = PseudoVMSET_M_B8 %avl, 0
%x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl, 5
@@ -136,7 +136,7 @@ body: |
; CHECK-NEXT: %false:vrnov0 = COPY $v8
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */
- %false:vr = COPY $v8
+ %false:vrnov0 = COPY $v8
%mask:vmv0 = COPY $v0
%true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
%x:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */
@@ -150,7 +150,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $v8, $v0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %false:vr = COPY $v8
+ ; CHECK-NEXT: %false:vrnov0 = COPY $v8
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
; CHECK-NEXT: {{ $}}
@@ -158,7 +158,7 @@ body: |
; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */
bb.0:
liveins: $v8, $v0
- %false:vr = COPY $v8
+ %false:vrnov0 = COPY $v8
%mask:vmv0 = COPY $v0
%true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
bb.1:
@@ -174,14 +174,14 @@ body: |
; CHECK: liveins: $v8, $v9, $v0, $x8, $x9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %pt:vrnov0 = COPY $v8
- ; CHECK-NEXT: %false:vr = COPY $v9
+ ; CHECK-NEXT: %false:vrnov0 = COPY $v9
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %avl1:gprnox0 = COPY $x8
; CHECK-NEXT: %avl2:gprnox0 = COPY $x9
; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
%pt:vrnov0 = COPY $v8
- %false:vr = COPY $v9
+ %false:vrnov0 = COPY $v9
%mask:vmv0 = COPY $v0
%avl1:gprnox0 = COPY $x8
%avl2:gprnox0 = COPY $x9
@@ -203,7 +203,7 @@ body: |
; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 1, 5 /* e32 */, 3 /* ta, ma */
; CHECK-NEXT: [[PseudoVMV_V_V_M1_:%[0-9]+]]:vr = PseudoVMV_V_V_M1 %pt, %true, 1, 5 /* e32 */, 0 /* tu, mu */
%pt:vrnov0 = COPY $v8
- %false:vr = COPY $v9
+ %false:vrnov0 = COPY $v9
%mask:vmv0 = COPY $v0
%true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 2, 5 /* e32 */, 3 /* ta, ma */
%5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 1, 5 /* e32 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index acd9519bb5a8e..5be32cc35fe37 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -867,8 +867,9 @@ define void @test_dag_loop() {
; CHECK-NEXT: vmseq.vv v0, v12, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu
+; CHECK-NEXT: vsetivli zero, 1, e16, m8, tu, mu
; CHECK-NEXT: vle16.v v8, (zero), v0.t
+; CHECK-NEXT: vsetivli zero, 0, e16, m8, ta, ma
; CHECK-NEXT: vse16.v v8, (zero)
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index e3f43cd904198..cc389236df3ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -516,15 +516,15 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <
; NOVLDEP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; NOVLDEP-NEXT: vmv1r.v v9, v0
; NOVLDEP-NEXT: vmv1r.v v0, v8
-; NOVLDEP-NEXT: vmv.v.i v24, 0
+; NOVLDEP-NEXT: vmv.v.i v16, 0
; NOVLDEP-NEXT: csrr a0, vlenb
-; NOVLDEP-NEXT: vmerge.vim v16, v24, 1, v0
+; NOVLDEP-NEXT: vmerge.vim v24, v16, 1, v0
; NOVLDEP-NEXT: vmv1r.v v0, v9
-; NOVLDEP-NEXT: vmerge.vim v8, v24, 1, v0
+; NOVLDEP-NEXT: vmerge.vim v8, v16, 1, v0
; NOVLDEP-NEXT: slli a0, a0, 3
; NOVLDEP-NEXT: addi a0, a0, -1
; NOVLDEP-NEXT: vslidedown.vx v8, v8, a0
-; NOVLDEP-NEXT: vslideup.vi v8, v16, 1
+; NOVLDEP-NEXT: vslideup.vi v8, v24, 1
; NOVLDEP-NEXT: vand.vi v8, v8, 1
; NOVLDEP-NEXT: vmsne.vi v0, v8, 0
; NOVLDEP-NEXT: ret
@@ -534,17 +534,17 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <
; VLDEP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; VLDEP-NEXT: vmv1r.v v9, v0
; VLDEP-NEXT: vmv1r.v v0, v8
-; VLDEP-NEXT: vmv.v.i v24, 0
+; VLDEP-NEXT: vmv.v.i v16, 0
; VLDEP-NEXT: csrr a0, vlenb
-; VLDEP-NEXT: vmerge.vim v16, v24, 1, v0
+; VLDEP-NEXT: vmerge.vim v24, v16, 1, v0
; VLDEP-NEXT: vmv1r.v v0, v9
-; VLDEP-NEXT: vmerge.vim v8, v24, 1, v0
+; VLDEP-NEXT: vmerge.vim v8, v16, 1, v0
; VLDEP-NEXT: slli a0, a0, 3
; VLDEP-NEXT: addi a0, a0, -1
; VLDEP-NEXT: vsetivli zero, 1, e8, m8, ta, ma
; VLDEP-NEXT: vslidedown.vx v8, v8, a0
; VLDEP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; VLDEP-NEXT: vslideup.vi v8, v16, 1
+; VLDEP-NEXT: vslideup.vi v8, v24, 1
; VLDEP-NEXT: vand.vi v8, v8, 1
; VLDEP-NEXT: vmsne.vi v0, v8, 0
; VLDEP-NEXT: ret
@@ -558,16 +558,16 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_max(<vscale x 64 x i1> %a, <vsc
; NOVLDEP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; NOVLDEP-NEXT: vmv1r.v v9, v0
; NOVLDEP-NEXT: vmv1r.v v0, v8
-; NOVLDEP-NEXT: vmv.v.i v24, 0
+; NOVLDEP-NEXT: vmv.v.i v16, 0
; NOVLDEP-NEXT: li a0, 127
-; NOVLDEP-NEXT: vmerge.vim v16, v24, 1, v0
+; NOVLDEP-NEXT: vmerge.vim v24, v16, 1, v0
; NOVLDEP-NEXT: vmv1r.v v0, v9
-; NOVLDEP-NEXT: vmerge.vim v8, v24, 1, v0
+; NOVLDEP-NEXT: vmerge.vim v8, v16, 1, v0
; NOVLDEP-NEXT: vslidedown.vx v8, v8, a0
; NOVLDEP-NEXT: csrr a0, vlenb
; NOVLDEP-NEXT: slli a0, a0, 3
; NOVLDEP-NEXT: addi a0, a0, -127
-; NOVLDEP-NEXT: vslideup.vx v8, v16, a0
+; NOVLDEP-NEXT: vslideup.vx v8, v24, a0
; NOVLDEP-NEXT: vand.vi v8, v8, 1
; NOVLDEP-NEXT: vmsne.vi v0, v8, 0
; NOVLDEP-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
index cd85853c2d12c..81a2388421cee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
@@ -1346,10 +1346,10 @@ name: vmerge_vim
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vim
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VIM_M1 $noreg, %x, 9, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
%y:vrnov0 = PseudoVMERGE_VIM_M1 $noreg, %x, 9, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1358,10 +1358,10 @@ name: vmerge_vim_incompatible_eew
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vim_incompatible_eew
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VIM_M1 $noreg, %x, 9, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
%y:vrnov0 = PseudoVMERGE_VIM_M1 $noreg, %x, 9, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1370,10 +1370,10 @@ name: vmerge_vim_incompatible_emul
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vim_incompatible_emul
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VIM_MF2 $noreg, %x, 9, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
%y:vrnov0 = PseudoVMERGE_VIM_MF2 $noreg, %x, 9, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1382,10 +1382,10 @@ name: vmerge_vxm
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vxm
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VXM_M1 $noreg, %x, $noreg, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
%y:vrnov0 = PseudoVMERGE_VXM_M1 $noreg, %x, $noreg, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1394,10 +1394,10 @@ name: vmerge_vxm_incompatible_eew
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vxm_incompatible_eew
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VXM_M1 $noreg, %x, $noreg, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
%y:vrnov0 = PseudoVMERGE_VXM_M1 $noreg, %x, $noreg, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1406,10 +1406,10 @@ name: vmerge_vxm_incompatible_emul
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vxm_incompatible_emul
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VXM_MF2 $noreg, %x, $noreg, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
%y:vrnov0 = PseudoVMERGE_VXM_MF2 $noreg, %x, $noreg, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1418,10 +1418,10 @@ name: vmerge_vvm
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vvm
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, $noreg, %x, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
%y:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, $noreg, %x, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1430,10 +1430,10 @@ name: vmerge_vvm_incompatible_eew
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vvm_incompatible_eew
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, $noreg, %x, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
%y:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, $noreg, %x, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
@@ -1442,10 +1442,10 @@ name: vmerge_vvm_incompatible_emul
body: |
bb.0:
; CHECK-LABEL: name: vmerge_vvm_incompatible_emul
- ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+ ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VVM_MF2 $noreg, $noreg, %x, $v0, 1, 3 /* e8 */
; CHECK-NEXT: $v8 = COPY %y
- %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
%y:vrnov0 = PseudoVMERGE_VVM_MF2 $noreg, $noreg, %x, $v0, 1, 3 /* e8 */
$v8 = COPY %y
...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-optimizer-subreg-assert.mir b/llvm/test/CodeGen/RISCV/rvv/vl-optimizer-subreg-assert.mir
index b816741285b43..7525bf70e62d8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-optimizer-subreg-assert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-optimizer-subreg-assert.mir
@@ -12,17 +12,17 @@ body: |
; CHECK-LABEL: name: vl_optimizer_subreg_assert
; CHECK: liveins: $v8m2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vmv0 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF
; CHECK-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 $noreg, killed [[DEF2]], [[DEF]], [[DEF1]], -1, 6 /* e64 */
; CHECK-NEXT: [[PseudoVREDMAXU_VS_M8_E64_:%[0-9]+]]:vr = PseudoVREDMAXU_VS_M8_E64 $noreg, [[PseudoVMERGE_VVM_M8_]], [[PseudoVMERGE_VVM_M8_]].sub_vrm1_0, -1, 6 /* e64 */, 1 /* ta, mu */
; CHECK-NEXT: [[PseudoVMV_X_S:%[0-9]+]]:gpr = PseudoVMV_X_S killed [[PseudoVREDMAXU_VS_M8_E64_]], 6 /* e64 */
; CHECK-NEXT: $x10 = COPY [[PseudoVMV_X_S]]
; CHECK-NEXT: PseudoRET implicit $x10
- %0:vrm8 = IMPLICIT_DEF
+ %0:vrm8nov0 = IMPLICIT_DEF
%1:vmv0 = IMPLICIT_DEF
- %2:vrm8 = IMPLICIT_DEF
+ %2:vrm8nov0 = IMPLICIT_DEF
%3:vrm8nov0 = PseudoVMERGE_VVM_M8 $noreg, killed %2, %0, %1, -1, 6 /* e64 */
%4:vr = PseudoVREDMAXU_VS_M8_E64 $noreg, %3, %3.sub_vrm1_0, -1, 6 /* e64 */, 1 /* ta, mu */
%5:gpr = PseudoVMV_X_S killed %4, 6 /* e64 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir
index 374afa3aafdea..338732c53aa3e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir
@@ -15,7 +15,7 @@ body: |
; CHECK-NEXT: %y:vrnov0 = PseudoVLE32_V_M1_MASK %passthru, $noreg, %mask, %avl, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 1)
%avl:gprnox0 = COPY $x8
%passthru:vrnov0 = COPY $v8
- %x:vr = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size)
+ %x:vrnov0 = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size)
%mask:vmv0 = COPY $v0
%y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */
...
@@ -32,8 +32,8 @@ body: |
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %y:vrnov0 = PseudoVLE32_V_M1_MASK %false, $noreg, %mask, %avl, 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
%avl:gprnox0 = COPY $x8
- %false:vr = COPY $v8
- %x:vr = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size)
+ %false:vrnov0 = COPY $v8
+ %x:vrnov0 = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size)
%mask:vmv0 = COPY $v0
%y:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %x, %mask, %avl, 5 /* e32 */
...
@@ -50,7 +50,7 @@ body: |
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %y:vrnov0 = PseudoVLE32_V_M1_MASK %passthru, $noreg, %mask, %avl, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 1)
%avl:gprnox0 = COPY $x8
- %x:vr = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size)
+ %x:vrnov0 = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size)
%passthru:vrnov0 = COPY $v8
%mask:vmv0 = COPY $v0
%y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */
@@ -68,7 +68,7 @@ body: |
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %y:vrnov0 = PseudoVNCLIPU_WV_MF2_MASK %passthru, $noreg, $noreg, %mask, 0, %avl, 5 /* e32 */, 0 /* tu, mu */, implicit-def $vxsat
%avl:gprnox0 = COPY $x8
- %x:vr = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5, 3, implicit-def $vxsat
+ %x:vrnov0 = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5, 3, implicit-def $vxsat
%passthru:vrnov0 = COPY $v8
%mask:vmv0 = COPY $v0
%y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */
@@ -82,13 +82,13 @@ body: |
; CHECK: liveins: $x8, $v0, $v8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %avl:gprnox0 = COPY $x8
- ; CHECK-NEXT: %x:vr = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5 /* e32 */, 3 /* ta, ma */, implicit-def $vxsat
+ ; CHECK-NEXT: %x:vrnov0 = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5 /* e32 */, 3 /* ta, ma */, implicit-def $vxsat
; CHECK-NEXT: %vxsat:gpr = COPY $vxsat
; CHECK-NEXT: %passthru:vrnov0 = COPY $v8
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */
%avl:gprnox0 = COPY $x8
- %x:vr = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5, 3, implicit-def $vxsat
+ %x:vrnov0 = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5, 3, implicit-def $vxsat
%vxsat:gpr = COPY $vxsat
%passthru:vrnov0 = COPY $v8
%mask:vmv0 = COPY $v0
@@ -116,3 +116,23 @@ body: |
%vfmadd:vrnov0 = nofpexcept PseudoVFMADD_VV_M1_E32 %x, %y, %passthru, 7, -1, 5 /* e32 */, 3 /* ta, ma */, implicit $frm
%vmerge:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %vfmadd, %mask, %avl, 5
...
+---
+name: true_copy
+body: |
+ bb.0:
+ liveins: $x8, $v0, $v8
+ ; CHECK-LABEL: name: true_copy
+ ; CHECK: liveins: $x8, $v0, $v8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %avl:gprnox0 = COPY $x8
+ ; CHECK-NEXT: %passthru:vrnov0 = COPY $v8
+ ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+ ; CHECK-NEXT: %z:vrnov0 = PseudoVLE32_V_M1_MASK %passthru, $noreg, %mask, %avl, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 1)
+ ; CHECK-NEXT: %y:vrnov0 = COPY %z
+ %avl:gprnox0 = COPY $x8
+ %passthru:vrnov0 = COPY $v8
+ %x:vr = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size)
+ %mask:vmv0 = COPY $v0
+ %y:vrnov0 = COPY %x
+ %z:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %y, %mask, %avl, 5 /* e32 */
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
index 9c3e96d818556..95232e734bb18 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
@@ -163,7 +163,7 @@ body: |
; CHECK-NEXT: %passthru:vrnov0 = COPY $v8
; CHECK-NEXT: %mask:vmv0 = COPY $v0
; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, $noreg, %mask, 4, 5 /* e32 */
- %passthru:vr = COPY $v8
+ %passthru:vrnov0 = COPY $v8
%mask:vmv0 = COPY $v0
%x:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %passthru, $noreg, %mask, 4, 5 /* e32 */
%z:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 0 /* tu, mu */