[llvm] [RISCV] Set a barrier between mask producer and user of V0 (PR #114012)
Pengcheng Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 31 03:18:26 PDT 2024
https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/114012
>From 327d658bf32b18126429a8419947a68cebbb324e Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Mon, 28 Oct 2024 12:14:23 +0800
Subject: [PATCH 1/4] [RISCV] Set a barrier between mask producer and user of
V0
Here we add a scheduling mutation in pre-ra scheduling, which will
adds an artificial dependency edge between mask producer and its
previous nearest instruction that uses V0 register.
This prevents making live intervals of mask registers longer and as
a consequence we can reduce some spills/moves.
>From the test changes, we can see some improvements and also some
regressions (more vtype toggles).
Partially fixes #113489.
---
llvm/lib/Target/RISCV/CMakeLists.txt | 1 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 6 +
llvm/lib/Target/RISCV/RISCVTargetMachine.h | 4 +
.../RISCV/RISCVVectorMaskDAGMutation.cpp | 102 +++
.../RISCV/rvv/constant-folding-crash.ll | 18 +-
.../RISCV/rvv/fixed-vectors-fmaximum-vp.ll | 214 +++----
.../RISCV/rvv/fixed-vectors-fmaximum.ll | 186 +++---
.../RISCV/rvv/fixed-vectors-fminimum-vp.ll | 214 +++----
.../RISCV/rvv/fixed-vectors-fminimum.ll | 186 +++---
...fixed-vectors-interleaved-access-zve32x.ll | 70 +--
.../RISCV/rvv/fixed-vectors-reduction-fp.ll | 404 +++++-------
.../RISCV/rvv/fixed-vectors-rint-vp.ll | 15 +-
.../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll | 587 +++++++++++-------
llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 436 ++++++-------
.../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll | 587 +++++++++++-------
llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 436 ++++++-------
llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll | 74 +--
llvm/test/CodeGen/RISCV/rvv/vmfeq.ll | 72 +--
llvm/test/CodeGen/RISCV/rvv/vmfge.ll | 72 +--
llvm/test/CodeGen/RISCV/rvv/vmfgt.ll | 72 +--
llvm/test/CodeGen/RISCV/rvv/vmfle.ll | 72 +--
llvm/test/CodeGen/RISCV/rvv/vmflt.ll | 72 +--
llvm/test/CodeGen/RISCV/rvv/vmfne.ll | 72 +--
llvm/test/CodeGen/RISCV/rvv/vmseq.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsge.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsgt.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsle.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsleu.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmslt.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsltu.ll | 106 ++--
llvm/test/CodeGen/RISCV/rvv/vmsne.ll | 106 ++--
33 files changed, 2470 insertions(+), 2562 deletions(-)
create mode 100644 llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index fd049d1a57860e..b95ad9dd428cc9 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen
RISCVTargetMachine.cpp
RISCVTargetObjectFile.cpp
RISCVTargetTransformInfo.cpp
+ RISCVVectorMaskDAGMutation.cpp
RISCVVectorPeephole.cpp
RISCVVLOptimizer.cpp
RISCVZacasABIFix.cpp
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 089dc6c529193d..b88bd18e7c8585 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -360,6 +360,12 @@ class RISCVPassConfig : public TargetPassConfig {
DAG->addMutation(createStoreClusterDAGMutation(
DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
}
+
+ const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
+ if (ST.hasVInstructions()) {
+ DAG = DAG ? DAG : createGenericSchedLive(C);
+ DAG->addMutation(createRISCVVectorMaskDAGMutation(DAG->TRI));
+ }
return DAG;
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index ce7b7907e1f3af..1a37891f847ae6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -61,6 +61,10 @@ class RISCVTargetMachine : public LLVMTargetMachine {
SMRange &SourceRange) const override;
void registerPassBuilderCallbacks(PassBuilder &PB) override;
};
+
+std::unique_ptr<ScheduleDAGMutation>
+createRISCVVectorMaskDAGMutation(const TargetRegisterInfo *TRI);
+
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
new file mode 100644
index 00000000000000..7e83509594507b
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
@@ -0,0 +1,102 @@
+//===- RISCVVectorMaskDAGMutation.cpp - RISCV Vector Mask DAGMutation -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A schedule mutation that add a dependency between masks producing
+// instructions and masked instructions, so that we will not extend the live
+// interval of mask register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace llvm {
+
+static inline bool isVectorMaskProducer(const MachineInstr *MI) {
+ switch (RISCV::getRVVMCOpcode(MI->getOpcode())) {
+ // Vector Mask Instructions
+ case RISCV::VMAND_MM:
+ case RISCV::VMNAND_MM:
+ case RISCV::VMANDN_MM:
+ case RISCV::VMXOR_MM:
+ case RISCV::VMOR_MM:
+ case RISCV::VMNOR_MM:
+ case RISCV::VMORN_MM:
+ case RISCV::VMXNOR_MM:
+ case RISCV::VMSBF_M:
+ case RISCV::VMSIF_M:
+ case RISCV::VMSOF_M:
+ case RISCV::VIOTA_M:
+ // Vector Integer Compare Instructions
+ case RISCV::VMSEQ_VV:
+ case RISCV::VMSEQ_VX:
+ case RISCV::VMSEQ_VI:
+ case RISCV::VMSNE_VV:
+ case RISCV::VMSNE_VX:
+ case RISCV::VMSNE_VI:
+ case RISCV::VMSLT_VV:
+ case RISCV::VMSLT_VX:
+ case RISCV::VMSLTU_VV:
+ case RISCV::VMSLTU_VX:
+ case RISCV::VMSLE_VV:
+ case RISCV::VMSLE_VX:
+ case RISCV::VMSLE_VI:
+ case RISCV::VMSLEU_VV:
+ case RISCV::VMSLEU_VX:
+ case RISCV::VMSLEU_VI:
+ case RISCV::VMSGTU_VX:
+ case RISCV::VMSGTU_VI:
+ case RISCV::VMSGT_VX:
+ case RISCV::VMSGT_VI:
+ // Vector Floating-Point Compare Instructions
+ case RISCV::VMFEQ_VV:
+ case RISCV::VMFEQ_VF:
+ case RISCV::VMFNE_VV:
+ case RISCV::VMFNE_VF:
+ case RISCV::VMFLT_VV:
+ case RISCV::VMFLT_VF:
+ case RISCV::VMFLE_VV:
+ case RISCV::VMFLE_VF:
+ case RISCV::VMFGT_VF:
+ case RISCV::VMFGE_VF:
+ return true;
+ }
+ return false;
+}
+
+class RISCVVectorMaskDAGMutation : public ScheduleDAGMutation {
+private:
+ const TargetRegisterInfo *TRI;
+
+public:
+ RISCVVectorMaskDAGMutation(const TargetRegisterInfo *TRI) : TRI(TRI) {}
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ SUnit *NearestUseV0SU = nullptr;
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (MI->findRegisterUseOperand(RISCV::V0, TRI))
+ NearestUseV0SU = &SU;
+
+ if (NearestUseV0SU && NearestUseV0SU != &SU && isVectorMaskProducer(MI))
+ DAG->addEdge(&SU, SDep(NearestUseV0SU, SDep::Artificial));
+ }
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation>
+createRISCVVectorMaskDAGMutation(const TargetRegisterInfo *TRI) {
+ return std::make_unique<RISCVVectorMaskDAGMutation>(TRI);
+}
+
+} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
index 7839b602706db1..113154c0f9855b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
@@ -19,19 +19,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
; RV32-LABEL: constant_folding_crash:
; RV32: # %bb.0: # %entry
; RV32-NEXT: lw a0, 8(a0)
+; RV32-NEXT: vmv1r.v v10, v0
; RV32-NEXT: andi a0, a0, 1
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vmsne.vi v10, v10, 0
-; RV32-NEXT: vmv1r.v v11, v0
-; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vmv.v.x v11, a0
+; RV32-NEXT: vmsne.vi v0, v11, 0
; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV32-NEXT: vmerge.vvm v8, v9, v8, v0
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmv1r.v v0, v11
+; RV32-NEXT: vmv1r.v v0, v10
; RV32-NEXT: vmerge.vim v8, v8, 1, v0
; RV32-NEXT: vrgather.vi v9, v8, 0
; RV32-NEXT: vmsne.vi v0, v9, 0
@@ -43,19 +42,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
; RV64-LABEL: constant_folding_crash:
; RV64: # %bb.0: # %entry
; RV64-NEXT: ld a0, 8(a0)
+; RV64-NEXT: vmv1r.v v12, v0
; RV64-NEXT: andi a0, a0, 1
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT: vmv.v.x v12, a0
-; RV64-NEXT: vmsne.vi v12, v12, 0
-; RV64-NEXT: vmv1r.v v13, v0
-; RV64-NEXT: vmv1r.v v0, v12
+; RV64-NEXT: vmv.v.x v13, a0
+; RV64-NEXT: vmsne.vi v0, v13, 0
; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT: vmerge.vvm v8, v10, v8, v0
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmv1r.v v0, v13
+; RV64-NEXT: vmv1r.v v0, v12
; RV64-NEXT: vmerge.vim v8, v8, 1, v0
; RV64-NEXT: vrgather.vi v9, v8, 0
; RV64-NEXT: vmsne.vi v0, v9, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 51eb63f5f92212..216300b23f4524 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -52,11 +52,10 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v2f16_unmasked:
@@ -66,12 +65,11 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -124,11 +122,10 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v4f16_unmasked:
@@ -138,12 +135,11 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -198,11 +194,10 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v8f16_unmasked:
@@ -214,11 +209,10 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0
-; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -274,11 +268,10 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v14
+; ZVFH-NEXT: vfmax.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v16f16_unmasked:
@@ -290,11 +283,10 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0
-; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -326,11 +318,10 @@ define <2 x float> @vfmax_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x float> @llvm.vp.maximum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> splat (i1 true), i32 %evl)
ret <2 x float> %v
@@ -360,11 +351,10 @@ define <4 x float> @vfmax_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <4 x float> @llvm.vp.maximum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> splat (i1 true), i32 %evl)
ret <4 x float> %v
@@ -396,11 +386,10 @@ define <8 x float> @vfmax_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <8 x float> @llvm.vp.maximum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> splat (i1 true), i32 %evl)
ret <8 x float> %v
@@ -432,11 +421,10 @@ define <16 x float> @vfmax_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <16 x float> @llvm.vp.maximum.v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> splat (i1 true), i32 %evl)
ret <16 x float> %v
@@ -466,11 +454,10 @@ define <2 x double> @vfmax_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x double> @llvm.vp.maximum.v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> splat (i1 true), i32 %evl)
ret <2 x double> %v
@@ -502,11 +489,10 @@ define <4 x double> @vfmax_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <4 x double> @llvm.vp.maximum.v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> splat (i1 true), i32 %evl)
ret <4 x double> %v
@@ -538,11 +524,10 @@ define <8 x double> @vfmax_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <8 x double> @llvm.vp.maximum.v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> splat (i1 true), i32 %evl)
ret <8 x double> %v
@@ -587,9 +572,8 @@ define <16 x double> @vfmax_vv_v16f64_unmasked(<16 x double> %va, <16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -710,21 +694,25 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: li a3, 40
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB25_2
@@ -733,52 +721,66 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v8, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a2, -16
+; CHECK-NEXT: sltu a2, a2, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 5
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: vfmax.vv v16, v16, v24
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v24, v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v8
+; CHECK-NEXT: vmv8r.v v8, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: li a1, 40
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index 02c2fafc89785f..5b4959409d63c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -15,25 +15,25 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_v2f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -48,25 +48,25 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_v4f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -81,25 +81,25 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_v8f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -114,25 +114,25 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v14
+; ZVFH-NEXT: vfmax.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_v16f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -147,11 +147,10 @@ define <2 x float> @vfmax_v2f32_vv(<2 x float> %a, <2 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %v
@@ -164,11 +163,10 @@ define <4 x float> @vfmax_v4f32_vv(<4 x float> %a, <4 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %v
@@ -181,11 +179,10 @@ define <8 x float> @vfmax_v8f32_vv(<8 x float> %a, <8 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <8 x float> @llvm.maximum.v8f32(<8 x float> %a, <8 x float> %b)
ret <8 x float> %v
@@ -198,11 +195,10 @@ define <16 x float> @vfmax_v16f32_vv(<16 x float> %a, <16 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %v
@@ -215,11 +211,10 @@ define <2 x double> @vfmax_v2f64_vv(<2 x double> %a, <2 x double> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %v
@@ -232,11 +227,10 @@ define <4 x double> @vfmax_v4f64_vv(<4 x double> %a, <4 x double> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
ret <4 x double> %v
@@ -249,11 +243,10 @@ define <8 x double> @vfmax_v8f64_vv(<8 x double> %a, <8 x double> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <8 x double> @llvm.maximum.v8f64(<8 x double> %a, <8 x double> %b)
ret <8 x double> %v
@@ -266,9 +259,8 @@ define <16 x double> @vfmax_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -302,13 +294,12 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
; ZVFH-LABEL: vfmax_v2f16_vv_nnana:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT: vfadd.vv v10, v8, v8
; ZVFH-NEXT: vmfeq.vv v0, v9, v9
-; ZVFH-NEXT: vmfeq.vv v8, v10, v10
-; ZVFH-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v8
-; ZVFH-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFH-NEXT: vfmax.vv v8, v11, v8
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vmerge.vvm v10, v9, v8, v0
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmerge.vvm v8, v8, v9, v0
+; ZVFH-NEXT: vfmax.vv v8, v10, v8
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnana:
@@ -319,16 +310,15 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -343,13 +333,12 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
; ZVFH-LABEL: vfmax_v2f16_vv_nnanb:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT: vfadd.vv v10, v9, v9
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v9, v10, v10
-; ZVFH-NEXT: vmerge.vvm v11, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v9
-; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnanb:
@@ -364,12 +353,11 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index 03e0ac42c442c4..b11a48c2857e3c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -52,11 +52,10 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_v2f16_unmasked:
@@ -66,12 +65,11 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -124,11 +122,10 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_v4f16_unmasked:
@@ -138,12 +135,11 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -198,11 +194,10 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_v8f16_unmasked:
@@ -214,11 +209,10 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0
-; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -274,11 +268,10 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v14
+; ZVFH-NEXT: vfmin.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_v16f16_unmasked:
@@ -290,11 +283,10 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0
-; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -326,11 +318,10 @@ define <2 x float> @vfmin_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x float> @llvm.vp.minimum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> splat (i1 true), i32 %evl)
ret <2 x float> %v
@@ -360,11 +351,10 @@ define <4 x float> @vfmin_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <4 x float> @llvm.vp.minimum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> splat (i1 true), i32 %evl)
ret <4 x float> %v
@@ -396,11 +386,10 @@ define <8 x float> @vfmin_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <8 x float> @llvm.vp.minimum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> splat (i1 true), i32 %evl)
ret <8 x float> %v
@@ -432,11 +421,10 @@ define <16 x float> @vfmin_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <16 x float> @llvm.vp.minimum.v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> splat (i1 true), i32 %evl)
ret <16 x float> %v
@@ -466,11 +454,10 @@ define <2 x double> @vfmin_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x double> @llvm.vp.minimum.v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> splat (i1 true), i32 %evl)
ret <2 x double> %v
@@ -502,11 +489,10 @@ define <4 x double> @vfmin_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <4 x double> @llvm.vp.minimum.v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> splat (i1 true), i32 %evl)
ret <4 x double> %v
@@ -538,11 +524,10 @@ define <8 x double> @vfmin_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <8 x double> @llvm.vp.minimum.v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> splat (i1 true), i32 %evl)
ret <8 x double> %v
@@ -587,9 +572,8 @@ define <16 x double> @vfmin_vv_v16f64_unmasked(<16 x double> %va, <16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -710,21 +694,25 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: li a3, 40
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB25_2
@@ -733,52 +721,66 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v8, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a2, -16
+; CHECK-NEXT: sltu a2, a2, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 5
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: vfmin.vv v16, v16, v24
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v24, v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v8
+; CHECK-NEXT: vmv8r.v v8, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: li a1, 40
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index b15d697f0754ed..a9d3c99b8cde42 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -15,25 +15,25 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_v2f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -48,25 +48,25 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_v4f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -81,25 +81,25 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_v8f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -114,25 +114,25 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) {
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v14
+; ZVFH-NEXT: vfmin.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_v16f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -147,11 +147,10 @@ define <2 x float> @vfmin_v2f32_vv(<2 x float> %a, <2 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
ret <2 x float> %v
@@ -164,11 +163,10 @@ define <4 x float> @vfmin_v4f32_vv(<4 x float> %a, <4 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %v
@@ -181,11 +179,10 @@ define <8 x float> @vfmin_v8f32_vv(<8 x float> %a, <8 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <8 x float> @llvm.minimum.v8f32(<8 x float> %a, <8 x float> %b)
ret <8 x float> %v
@@ -198,11 +195,10 @@ define <16 x float> @vfmin_v16f32_vv(<16 x float> %a, <16 x float> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <16 x float> @llvm.minimum.v16f32(<16 x float> %a, <16 x float> %b)
ret <16 x float> %v
@@ -215,11 +211,10 @@ define <2 x double> @vfmin_v2f64_vv(<2 x double> %a, <2 x double> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %v
@@ -232,11 +227,10 @@ define <4 x double> @vfmin_v4f64_vv(<4 x double> %a, <4 x double> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
ret <4 x double> %v
@@ -249,11 +243,10 @@ define <8 x double> @vfmin_v8f64_vv(<8 x double> %a, <8 x double> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <8 x double> @llvm.minimum.v8f64(<8 x double> %a, <8 x double> %b)
ret <8 x double> %v
@@ -266,9 +259,8 @@ define <16 x double> @vfmin_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -302,13 +294,12 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
; ZVFH-LABEL: vfmin_v2f16_vv_nnana:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT: vfadd.vv v10, v8, v8
; ZVFH-NEXT: vmfeq.vv v0, v9, v9
-; ZVFH-NEXT: vmfeq.vv v8, v10, v10
-; ZVFH-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v8
-; ZVFH-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFH-NEXT: vfmin.vv v8, v11, v8
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vmerge.vvm v10, v9, v8, v0
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmerge.vvm v8, v8, v9, v0
+; ZVFH-NEXT: vfmin.vv v8, v10, v8
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnana:
@@ -319,16 +310,15 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -343,13 +333,12 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
; ZVFH-LABEL: vfmin_v2f16_vv_nnanb:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFH-NEXT: vfadd.vv v10, v9, v9
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v9, v10, v10
-; ZVFH-NEXT: vmerge.vvm v11, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v9
-; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnanb:
@@ -364,12 +353,11 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index af46849ae08719..363008233b8d86 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,24 +7,20 @@
define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-LABEL: load_large_vector:
; ZVE32X: # %bb.0:
-; ZVE32X-NEXT: ld a1, 0(a0)
-; ZVE32X-NEXT: ld a2, 8(a0)
-; ZVE32X-NEXT: ld a3, 24(a0)
-; ZVE32X-NEXT: ld a4, 32(a0)
-; ZVE32X-NEXT: ld a5, 48(a0)
-; ZVE32X-NEXT: ld a6, 56(a0)
-; ZVE32X-NEXT: ld a7, 72(a0)
-; ZVE32X-NEXT: ld a0, 80(a0)
-; ZVE32X-NEXT: xor a3, a3, a4
-; ZVE32X-NEXT: snez a3, a3
+; ZVE32X-NEXT: ld a1, 24(a0)
+; ZVE32X-NEXT: ld a2, 32(a0)
+; ZVE32X-NEXT: ld a3, 8(a0)
+; ZVE32X-NEXT: ld a4, 0(a0)
+; ZVE32X-NEXT: xor a1, a1, a2
+; ZVE32X-NEXT: snez a1, a1
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmv.s.x v8, a3
+; ZVE32X-NEXT: vmv.s.x v8, a1
; ZVE32X-NEXT: vand.vi v8, v8, 1
; ZVE32X-NEXT: vmsne.vi v0, v8, 0
-; ZVE32X-NEXT: vmv.s.x v9, zero
-; ZVE32X-NEXT: vmerge.vim v8, v9, 1, v0
-; ZVE32X-NEXT: xor a1, a1, a2
-; ZVE32X-NEXT: snez a1, a1
+; ZVE32X-NEXT: vmv.s.x v8, zero
+; ZVE32X-NEXT: vmerge.vim v9, v8, 1, v0
+; ZVE32X-NEXT: xor a3, a4, a3
+; ZVE32X-NEXT: snez a1, a3
; ZVE32X-NEXT: vmv.s.x v10, a1
; ZVE32X-NEXT: vand.vi v10, v10, 1
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
@@ -32,38 +28,36 @@ define <4 x i1> @load_large_vector(ptr %p) {
; ZVE32X-NEXT: vmv.v.i v10, 0
; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0
; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma
-; ZVE32X-NEXT: vslideup.vi v11, v8, 1
+; ZVE32X-NEXT: vslideup.vi v11, v9, 1
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
-; ZVE32X-NEXT: xor a1, a5, a6
+; ZVE32X-NEXT: ld a1, 48(a0)
+; ZVE32X-NEXT: ld a2, 56(a0)
+; ZVE32X-NEXT: ld a3, 72(a0)
+; ZVE32X-NEXT: ld a0, 80(a0)
+; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
+; ZVE32X-NEXT: xor a1, a1, a2
; ZVE32X-NEXT: snez a1, a1
-; ZVE32X-NEXT: vmv.s.x v8, a1
+; ZVE32X-NEXT: vmv.s.x v11, a1
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vand.vi v8, v8, 1
-; ZVE32X-NEXT: vmsne.vi v8, v8, 0
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0
-; ZVE32X-NEXT: vmv1r.v v0, v8
-; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmerge.vim v8, v9, 1, v0
+; ZVE32X-NEXT: vand.vi v11, v11, 1
+; ZVE32X-NEXT: vmsne.vi v0, v11, 0
+; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0
; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
-; ZVE32X-NEXT: vslideup.vi v11, v8, 2
+; ZVE32X-NEXT: vslideup.vi v9, v11, 2
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmsne.vi v0, v11, 0
-; ZVE32X-NEXT: xor a0, a7, a0
+; ZVE32X-NEXT: vmsne.vi v0, v9, 0
+; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
+; ZVE32X-NEXT: xor a0, a3, a0
; ZVE32X-NEXT: snez a0, a0
-; ZVE32X-NEXT: vmv.s.x v8, a0
-; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vand.vi v8, v8, 1
-; ZVE32X-NEXT: vmsne.vi v8, v8, 0
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmerge.vim v10, v10, 1, v0
-; ZVE32X-NEXT: vmv1r.v v0, v8
+; ZVE32X-NEXT: vmv.s.x v10, a0
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmerge.vim v8, v9, 1, v0
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vslideup.vi v10, v8, 3
+; ZVE32X-NEXT: vand.vi v10, v10, 1
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
+; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
+; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT: vslideup.vi v9, v8, 3
+; ZVE32X-NEXT: vmsne.vi v0, v9, 0
; ZVE32X-NEXT: ret
;
; ZVE64X-LABEL: load_large_vector:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 4be680e272e5b9..0f585887ac8312 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1948,41 +1948,26 @@ declare float @llvm.vector.reduce.fminimum.v64f32(<64 x float>)
define float @vreduce_fminimum_v64f32(ptr %x) {
; CHECK-LABEL: vreduce_fminimum_v64f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: li a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle32.v v16, (a0)
-; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB119_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
-; CHECK-NEXT: j .LBB119_3
+; CHECK-NEXT: ret
; CHECK-NEXT: .LBB119_2:
; CHECK-NEXT: vfredmin.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
-; CHECK-NEXT: .LBB119_3:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <64 x float>, ptr %x
%red = call float @llvm.vector.reduce.fminimum.v64f32(<64 x float> %v)
@@ -2023,73 +2008,61 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: addi a2, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v16, (a2)
+; CHECK-NEXT: vle32.v v24, (a2)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: vle32.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle32.v v24, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vle32.v v24, (a1)
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v24, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB121_2
@@ -2286,40 +2259,25 @@ declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)
define double @vreduce_fminimum_v32f64(ptr %x) {
; CHECK-LABEL: vreduce_fminimum_v32f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB131_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, %hi(.LCPI131_0)
; CHECK-NEXT: fld fa0, %lo(.LCPI131_0)(a0)
-; CHECK-NEXT: j .LBB131_3
+; CHECK-NEXT: ret
; CHECK-NEXT: .LBB131_2:
; CHECK-NEXT: vfredmin.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
-; CHECK-NEXT: .LBB131_3:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <32 x double>, ptr %x
%red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %v)
@@ -2358,73 +2316,61 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vle64.v v24, (a1)
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v24, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB133_2
@@ -2701,41 +2647,26 @@ declare float @llvm.vector.reduce.fmaximum.v64f32(<64 x float>)
define float @vreduce_fmaximum_v64f32(ptr %x) {
; CHECK-LABEL: vreduce_fmaximum_v64f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: li a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle32.v v16, (a0)
-; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB147_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
-; CHECK-NEXT: j .LBB147_3
+; CHECK-NEXT: ret
; CHECK-NEXT: .LBB147_2:
; CHECK-NEXT: vfredmax.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
-; CHECK-NEXT: .LBB147_3:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <64 x float>, ptr %x
%red = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> %v)
@@ -2776,73 +2707,61 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: addi a2, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v16, (a2)
+; CHECK-NEXT: vle32.v v24, (a2)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: vle32.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle32.v v24, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vle32.v v24, (a1)
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v24, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB149_2
@@ -3039,40 +2958,25 @@ declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
define double @vreduce_fmaximum_v32f64(ptr %x) {
; CHECK-LABEL: vreduce_fmaximum_v32f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB159_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, %hi(.LCPI159_0)
; CHECK-NEXT: fld fa0, %lo(.LCPI159_0)(a0)
-; CHECK-NEXT: j .LBB159_3
+; CHECK-NEXT: ret
; CHECK-NEXT: .LBB159_2:
; CHECK-NEXT: vfredmax.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
-; CHECK-NEXT: .LBB159_3:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <32 x double>, ptr %x
%red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v)
@@ -3111,73 +3015,61 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vle64.v v24, (a1)
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v24, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB161_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 1f856d04ca89fd..9d2bd4974a0b83 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -586,20 +586,17 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
-; CHECK-NEXT: addi a2, a0, -16
-; CHECK-NEXT: sltu a0, a0, a2
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a2
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfabs.v v24, v16
-; CHECK-NEXT: vmflt.vf v7, v24, fa5
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: addi a1, a0, -16
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index d8c3ab27cfad12..2f92407b1965ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -22,15 +22,16 @@ define <vscale x 1 x bfloat> @vfmax_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscal
; CHECK-LABEL: vfmax_nxv1bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v9, v9
-; CHECK-NEXT: vmfeq.vv v8, v10, v10
-; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
-; CHECK-NEXT: vfmax.vv v9, v8, v11
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vfmax.vv v9, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
; CHECK-NEXT: ret
@@ -44,15 +45,16 @@ define <vscale x 2 x bfloat> @vfmax_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscal
; CHECK-LABEL: vfmax_nxv2bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v9, v9
-; CHECK-NEXT: vmfeq.vv v8, v10, v10
-; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
-; CHECK-NEXT: vfmax.vv v9, v8, v11
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vfmax.vv v9, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
; CHECK-NEXT: ret
@@ -66,15 +68,16 @@ define <vscale x 4 x bfloat> @vfmax_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscal
; CHECK-LABEL: vfmax_nxv4bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v12, v12
-; CHECK-NEXT: vmfeq.vv v8, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v12, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT: vfmax.vv v10, v8, v14
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT: vfmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: ret
@@ -88,15 +91,16 @@ define <vscale x 8 x bfloat> @vfmax_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscal
; CHECK-LABEL: vfmax_nxv8bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v8, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v16, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT: vfmax.vv v12, v8, v20
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT: vfmax.vv v12, v12, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
; CHECK-NEXT: ret
@@ -109,31 +113,19 @@ declare <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat>, <
define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
; CHECK-LABEL: vfmax_nxv16bf16_vv:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vfmax.vv v16, v16, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
ret <vscale x 16 x bfloat> %v
@@ -142,55 +134,177 @@ define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
declare <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; CHECK-LABEL: vfmax_nxv32bf16_vv:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv8r.v v0, v8
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v3, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v3
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: vfmax.vv v16, v8, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfmax_nxv32bf16_vv:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 5
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmv8r.v v0, v8
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v24, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmerge.vvm v8, v8, v24, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v4
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v16, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v28
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmerge.vvm v24, v16, v8, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfmax.vv v24, v16, v24
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfmax.vv v16, v8, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 5
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_nxv32bf16_vv:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv8r.v v0, v8
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v4
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v28
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v24, v16, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
ret <vscale x 32 x bfloat> %v
}
@@ -202,25 +316,25 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_nxv1f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -235,25 +349,25 @@ define <vscale x 2 x half> @vfmax_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_nxv2f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmax.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -268,25 +382,25 @@ define <vscale x 4 x half> @vfmax_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_nxv4f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -301,25 +415,25 @@ define <vscale x 8 x half> @vfmax_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v14
+; ZVFH-NEXT: vfmax.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_nxv8f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -334,40 +448,27 @@ define <vscale x 16 x half> @vfmax_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v16, v12, v12
-; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0
-; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vmerge.vvm v16, v8, v12, v0
+; ZVFH-NEXT: vmfeq.vv v0, v12, v12
; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v20
+; ZVFH-NEXT: vfmax.vv v8, v8, v16
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_nxv16f16_vv:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.maximum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
ret <vscale x 16 x half> %v
@@ -380,9 +481,8 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: vmfeq.vv v0, v16, v16
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmax.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -391,48 +491,82 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv8r.v v0, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmfeq.vv v3, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT: vfmax.vv v8, v8, v16
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vmfeq.vv v7, v8, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT: vfmax.vv v16, v8, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v24, v16, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -447,11 +581,10 @@ define <vscale x 1 x float> @vfmax_nxv1f32_vv(<vscale x 1 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x float> @llvm.maximum.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b)
ret <vscale x 1 x float> %v
@@ -464,11 +597,10 @@ define <vscale x 2 x float> @vfmax_nxv2f32_vv(<vscale x 2 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 2 x float> @llvm.maximum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
ret <vscale x 2 x float> %v
@@ -481,11 +613,10 @@ define <vscale x 4 x float> @vfmax_nxv4f32_vv(<vscale x 4 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 4 x float> @llvm.maximum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
ret <vscale x 4 x float> %v
@@ -498,11 +629,10 @@ define <vscale x 8 x float> @vfmax_nxv8f32_vv(<vscale x 8 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 8 x float> @llvm.maximum.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b)
ret <vscale x 8 x float> %v
@@ -515,9 +645,8 @@ define <vscale x 16 x float> @vfmax_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -532,11 +661,10 @@ define <vscale x 1 x double> @vfmax_nxv1f64_vv(<vscale x 1 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x double> @llvm.maximum.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b)
ret <vscale x 1 x double> %v
@@ -549,11 +677,10 @@ define <vscale x 2 x double> @vfmax_nxv2f64_vv(<vscale x 2 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
ret <vscale x 2 x double> %v
@@ -566,11 +693,10 @@ define <vscale x 4 x double> @vfmax_nxv4f64_vv(<vscale x 4 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 4 x double> @llvm.maximum.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b)
ret <vscale x 4 x double> %v
@@ -583,9 +709,8 @@ define <vscale x 8 x double> @vfmax_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -632,16 +757,15 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -673,12 +797,11 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index dd01e1c1ee66d0..3c76bb6db601b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -48,12 +48,11 @@ define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v11, v11
-; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: vfmax.vv v9, v8, v9
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
@@ -96,12 +95,11 @@ define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v11, v11
-; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: vfmax.vv v9, v8, v9
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
@@ -148,11 +146,10 @@ define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v12, v12
-; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0
-; CHECK-NEXT: vfmax.vv v10, v8, v14
+; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT: vfmax.vv v10, v10, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: ret
@@ -198,11 +195,10 @@ define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v16, v16
-; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0
-; CHECK-NEXT: vfmax.vv v12, v8, v20
+; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT: vfmax.vv v12, v12, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
; CHECK-NEXT: ret
@@ -254,12 +250,6 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <v
define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -267,20 +257,12 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vfmax.vv v16, v16, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.maximum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x bfloat> %v
@@ -441,9 +423,10 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: sub a3, a0, a1
@@ -456,45 +439,33 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v24, a2
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: vmfeq.vv v6, v24, v24, v0.t
+; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vfmax.vv v16, v16, v8, v0.t
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -504,7 +475,7 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB11_2:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -512,24 +483,33 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v3, v16, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v3
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT: vfmax.vv v16, v16, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -580,11 +560,10 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
@@ -594,12 +573,11 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -652,11 +630,10 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
@@ -666,12 +643,11 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -726,11 +702,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
@@ -742,11 +717,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0
-; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -802,11 +776,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v14
+; ZVFH-NEXT: vfmax.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
@@ -818,11 +791,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0
-; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -891,21 +863,14 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v16, v12, v12
-; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0
-; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vmerge.vvm v16, v8, v12, v0
+; ZVFH-NEXT: vmfeq.vv v0, v12, v12
; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v20
+; ZVFH-NEXT: vfmax.vv v8, v8, v16
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_nxv16f16_unmasked:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -913,20 +878,12 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.maximum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x half> %v
@@ -1114,9 +1071,8 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: vmfeq.vv v0, v16, v16
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmax.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -1126,9 +1082,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: li a2, 24
+; ZVFHMIN-NEXT: mul a1, a1, a2
; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: sub a3, a0, a1
@@ -1141,45 +1098,33 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v24, a2
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
+; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v6, v24, v24, v0.t
+; ZVFHMIN-NEXT: vmv8r.v v8, v16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vmv1r.v v0, v6
; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t
; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -1189,7 +1134,7 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: mv a0, a1
; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1197,24 +1142,33 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0
-; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16
; ZVFHMIN-NEXT: vmv8r.v v8, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -1246,11 +1200,10 @@ define <vscale x 1 x float> @vfmax_vv_nxv1f32_unmasked(<vscale x 1 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x float> @llvm.vp.maximum.nxv1f32(<vscale x 1 x float> %va, <vscale x 1 x float> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x float> %v
@@ -1280,11 +1233,10 @@ define <vscale x 2 x float> @vfmax_vv_nxv2f32_unmasked(<vscale x 2 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 2 x float> @llvm.vp.maximum.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
@@ -1316,11 +1268,10 @@ define <vscale x 4 x float> @vfmax_vv_nxv4f32_unmasked(<vscale x 4 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 4 x float> @llvm.vp.maximum.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x float> %v
@@ -1352,11 +1303,10 @@ define <vscale x 8 x float> @vfmax_vv_nxv8f32_unmasked(<vscale x 8 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 8 x float> @llvm.vp.maximum.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret <vscale x 8 x float> %v
@@ -1386,11 +1336,10 @@ define <vscale x 1 x double> @vfmax_vv_nxv1f64_unmasked(<vscale x 1 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x double> @llvm.vp.maximum.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x double> %v
@@ -1422,11 +1371,10 @@ define <vscale x 2 x double> @vfmax_vv_nxv2f64_unmasked(<vscale x 2 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 2 x double> @llvm.vp.maximum.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
@@ -1458,11 +1406,10 @@ define <vscale x 4 x double> @vfmax_vv_nxv4f64_unmasked(<vscale x 4 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 4 x double> @llvm.vp.maximum.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x double> %v
@@ -1507,9 +1454,8 @@ define <vscale x 8 x double> @vfmax_vv_nxv8f64_unmasked(<vscale x 8 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -1698,64 +1644,92 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vl8re64.v v24, (a3)
+; CHECK-NEXT: vl8re64.v v16, (a3)
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a3, a3, a4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: sub a3, a2, a1
; CHECK-NEXT: sltu a4, a2, a3
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmv8r.v v16, v24
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a3, a3, a4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a3, a3, a4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vl8re64.v v16, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: vfmax.vv v8, v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB41_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB41_2:
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v24, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a0, a0, a3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v24
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a2, a1, .LBB41_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB41_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index 2371840002f40b..c0707d267f5af0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -22,15 +22,16 @@ define <vscale x 1 x bfloat> @vfmin_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscal
; CHECK-LABEL: vfmin_nxv1bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v9, v9
-; CHECK-NEXT: vmfeq.vv v8, v10, v10
-; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
-; CHECK-NEXT: vfmin.vv v9, v8, v11
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vfmin.vv v9, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
; CHECK-NEXT: ret
@@ -44,15 +45,16 @@ define <vscale x 2 x bfloat> @vfmin_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscal
; CHECK-LABEL: vfmin_nxv2bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v9, v9
-; CHECK-NEXT: vmfeq.vv v8, v10, v10
-; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
-; CHECK-NEXT: vfmin.vv v9, v8, v11
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vfmin.vv v9, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
; CHECK-NEXT: ret
@@ -66,15 +68,16 @@ define <vscale x 4 x bfloat> @vfmin_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscal
; CHECK-LABEL: vfmin_nxv4bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v12, v12
-; CHECK-NEXT: vmfeq.vv v8, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v12, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT: vfmin.vv v10, v8, v14
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT: vfmin.vv v10, v10, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: ret
@@ -88,15 +91,16 @@ define <vscale x 8 x bfloat> @vfmin_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscal
; CHECK-LABEL: vfmin_nxv8bf16_vv:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v8, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v16, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT: vfmin.vv v12, v8, v20
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT: vfmin.vv v12, v12, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
; CHECK-NEXT: ret
@@ -109,31 +113,19 @@ declare <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat>, <
define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
; CHECK-LABEL: vfmin_nxv16bf16_vv:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vfmin.vv v16, v16, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
ret <vscale x 16 x bfloat> %v
@@ -142,55 +134,177 @@ define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
declare <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; CHECK-LABEL: vfmin_nxv32bf16_vv:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv8r.v v0, v8
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v3, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v3
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: vfmin.vv v16, v8, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vfmin_nxv32bf16_vv:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 5
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmv8r.v v0, v8
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v24, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmerge.vvm v8, v8, v24, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v4
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v16, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v28
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vmerge.vvm v24, v16, v8, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfmin.vv v24, v16, v24
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfmin.vv v16, v8, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 5
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmin_nxv32bf16_vv:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv8r.v v0, v8
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v4
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v28
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmin.vv v24, v16, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
ret <vscale x 32 x bfloat> %v
}
@@ -202,25 +316,25 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_nxv1f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -235,25 +349,25 @@ define <vscale x 2 x half> @vfmin_nxv2f16_vv(<vscale x 2 x half> %a, <vscale x 2
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_nxv2f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
-; ZVFHMIN-NEXT: vfmin.vv v9, v8, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
@@ -268,25 +382,25 @@ define <vscale x 4 x half> @vfmin_nxv4f16_vv(<vscale x 4 x half> %a, <vscale x 4
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_nxv4f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT: vmerge.vvm v14, v12, v10, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
-; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -301,25 +415,25 @@ define <vscale x 8 x half> @vfmin_nxv8f16_vv(<vscale x 8 x half> %a, <vscale x 8
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v14
+; ZVFH-NEXT: vfmin.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_nxv8f16_vv:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v20, v16, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
-; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -334,40 +448,27 @@ define <vscale x 16 x half> @vfmin_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v16, v12, v12
-; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0
-; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vmerge.vvm v16, v8, v12, v0
+; ZVFH-NEXT: vmfeq.vv v0, v12, v12
; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v20
+; ZVFH-NEXT: vfmin.vv v8, v8, v16
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_nxv16f16_vv:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.minimum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
ret <vscale x 16 x half> %v
@@ -380,9 +481,8 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: vmfeq.vv v0, v16, v16
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmin.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -391,48 +491,82 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv8r.v v0, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmfeq.vv v3, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT: vfmin.vv v8, v8, v16
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vmfeq.vv v7, v8, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT: vfmin.vv v16, v8, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmin.vv v24, v16, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -447,11 +581,10 @@ define <vscale x 1 x float> @vfmin_nxv1f32_vv(<vscale x 1 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x float> @llvm.minimum.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b)
ret <vscale x 1 x float> %v
@@ -464,11 +597,10 @@ define <vscale x 2 x float> @vfmin_nxv2f32_vv(<vscale x 2 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 2 x float> @llvm.minimum.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
ret <vscale x 2 x float> %v
@@ -481,11 +613,10 @@ define <vscale x 4 x float> @vfmin_nxv4f32_vv(<vscale x 4 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 4 x float> @llvm.minimum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
ret <vscale x 4 x float> %v
@@ -498,11 +629,10 @@ define <vscale x 8 x float> @vfmin_nxv8f32_vv(<vscale x 8 x float> %a, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 8 x float> @llvm.minimum.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b)
ret <vscale x 8 x float> %v
@@ -515,9 +645,8 @@ define <vscale x 16 x float> @vfmin_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -532,11 +661,10 @@ define <vscale x 1 x double> @vfmin_nxv1f64_vv(<vscale x 1 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x double> @llvm.minimum.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b)
ret <vscale x 1 x double> %v
@@ -549,11 +677,10 @@ define <vscale x 2 x double> @vfmin_nxv2f64_vv(<vscale x 2 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
ret <vscale x 2 x double> %v
@@ -566,11 +693,10 @@ define <vscale x 4 x double> @vfmin_nxv4f64_vv(<vscale x 4 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 4 x double> @llvm.minimum.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b)
ret <vscale x 4 x double> %v
@@ -583,9 +709,8 @@ define <vscale x 8 x double> @vfmin_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -632,16 +757,15 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -673,12 +797,11 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v9, v0
+; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v9, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 85cac8d1870594..9750a9b5477c92 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -48,12 +48,11 @@ define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v11, v11
-; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: vfmin.vv v9, v8, v9
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
@@ -96,12 +95,11 @@ define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v11, v11
-; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: vfmin.vv v9, v8, v9
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
@@ -148,11 +146,10 @@ define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v12, v12
-; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0
-; CHECK-NEXT: vfmin.vv v10, v8, v14
+; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0
+; CHECK-NEXT: vfmin.vv v10, v10, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: ret
@@ -198,11 +195,10 @@ define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v16, v16
-; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0
-; CHECK-NEXT: vfmin.vv v12, v8, v20
+; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0
+; CHECK-NEXT: vfmin.vv v12, v12, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
; CHECK-NEXT: ret
@@ -254,12 +250,6 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <v
define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
; CHECK-LABEL: vfmin_vv_nxv16bf16_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -267,20 +257,12 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v8, v16
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vfmin.vv v16, v16, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.minimum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x bfloat> %v
@@ -441,9 +423,10 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: sub a3, a0, a1
@@ -456,45 +439,33 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v24, a2
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: vmfeq.vv v6, v24, v24, v0.t
+; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vfmin.vv v16, v16, v8, v0.t
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -504,7 +475,7 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB11_2:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -512,24 +483,33 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v3, v16, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v3
-; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT: vfmin.vv v16, v16, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v8
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -580,11 +560,10 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
@@ -594,12 +573,11 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -652,11 +630,10 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
@@ -666,12 +643,11 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -726,11 +702,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v11
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
@@ -742,11 +717,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0
-; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -802,11 +776,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v14
+; ZVFH-NEXT: vfmin.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
@@ -818,11 +791,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0
-; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -891,21 +863,14 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v16, v12, v12
-; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0
-; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vmerge.vvm v16, v8, v12, v0
+; ZVFH-NEXT: vmfeq.vv v0, v12, v12
; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v20
+; ZVFH-NEXT: vfmin.vv v8, v8, v16
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_vv_nxv16f16_unmasked:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -913,20 +878,12 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.minimum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x half> %v
@@ -1114,9 +1071,8 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmv1r.v v0, v7
+; ZVFH-NEXT: vmfeq.vv v0, v16, v16
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmin.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -1126,9 +1082,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: li a2, 24
+; ZVFHMIN-NEXT: mul a1, a1, a2
; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: sub a3, a0, a1
@@ -1141,45 +1098,33 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v24, a2
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
+; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v6, v24, v24, v0.t
+; ZVFHMIN-NEXT: vmv8r.v v8, v16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vmv1r.v v0, v6
; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t
; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -1189,7 +1134,7 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: mv a0, a1
; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1197,24 +1142,33 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0
-; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16
; ZVFHMIN-NEXT: vmv8r.v v8, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -1246,11 +1200,10 @@ define <vscale x 1 x float> @vfmin_vv_nxv1f32_unmasked(<vscale x 1 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x float> @llvm.vp.minimum.nxv1f32(<vscale x 1 x float> %va, <vscale x 1 x float> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x float> %v
@@ -1280,11 +1233,10 @@ define <vscale x 2 x float> @vfmin_vv_nxv2f32_unmasked(<vscale x 2 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 2 x float> @llvm.vp.minimum.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
@@ -1316,11 +1268,10 @@ define <vscale x 4 x float> @vfmin_vv_nxv4f32_unmasked(<vscale x 4 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 4 x float> @llvm.vp.minimum.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x float> %v
@@ -1352,11 +1303,10 @@ define <vscale x 8 x float> @vfmin_vv_nxv8f32_unmasked(<vscale x 8 x float> %va,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 8 x float> @llvm.vp.minimum.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
ret <vscale x 8 x float> %v
@@ -1386,11 +1336,10 @@ define <vscale x 1 x double> @vfmin_vv_nxv1f64_unmasked(<vscale x 1 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v11
+; CHECK-NEXT: vfmin.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 1 x double> @llvm.vp.minimum.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x double> %v
@@ -1422,11 +1371,10 @@ define <vscale x 2 x double> @vfmin_vv_nxv2f64_unmasked(<vscale x 2 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v14
+; CHECK-NEXT: vfmin.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 2 x double> @llvm.vp.minimum.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
@@ -1458,11 +1406,10 @@ define <vscale x 4 x double> @vfmin_vv_nxv4f64_unmasked(<vscale x 4 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v20
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 4 x double> @llvm.vp.minimum.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
ret <vscale x 4 x double> %v
@@ -1507,9 +1454,8 @@ define <vscale x 8 x double> @vfmin_vv_nxv8f64_unmasked(<vscale x 8 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -1698,64 +1644,92 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vl8re64.v v24, (a3)
+; CHECK-NEXT: vl8re64.v v16, (a3)
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a3, a3, a4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: sub a3, a2, a1
; CHECK-NEXT: sltu a4, a2, a3
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmv8r.v v16, v24
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a3, a3, a4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a3, a3, a4
+; CHECK-NEXT: add a3, sp, a3
+; CHECK-NEXT: addi a3, a3, 16
+; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vl8re64.v v16, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: vfmin.vv v8, v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB41_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB41_2:
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v24, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a0, a0, a3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v24
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a2, a1, .LBB41_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB41_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
index 9d0234d2ec2fbc..76493d9514a8dc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
@@ -13,14 +13,13 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: srli a1, a0, 1
-; CHECK-NEXT: vsll.vv v10, v8, v9
-; CHECK-NEXT: vsra.vv v9, v10, v9
-; CHECK-NEXT: vmsne.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmv.v.x v10, a1
; CHECK-NEXT: slli a0, a0, 63
-; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vmerge.vxm v10, v10, a0, v0
+; CHECK-NEXT: vsll.vv v11, v8, v9
+; CHECK-NEXT: vsra.vv v9, v11, v9
+; CHECK-NEXT: vmsne.vv v0, v8, v9
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
; CHECK-NEXT: ret
%tmp = call <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
ret <2 x i64> %tmp
@@ -34,12 +33,11 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: addi a1, a0, -1
; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vmerge.vxm v10, v10, a0, v0
; CHECK-NEXT: vsll.vv v11, v8, v9
; CHECK-NEXT: vsra.vv v9, v11, v9
-; CHECK-NEXT: vmsne.vv v8, v8, v9
-; CHECK-NEXT: vmerge.vxm v9, v10, a0, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v11, v9, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v9
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %tmp
@@ -53,12 +51,11 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; CHECK-NEXT: lui a0, 8
; CHECK-NEXT: addi a1, a0, -1
; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vmerge.vxm v10, v10, a0, v0
; CHECK-NEXT: vsll.vv v11, v8, v9
; CHECK-NEXT: vsra.vv v9, v11, v9
-; CHECK-NEXT: vmsne.vv v8, v8, v9
-; CHECK-NEXT: vmerge.vxm v9, v10, a0, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v11, v9, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v9
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
; CHECK-NEXT: ret
%tmp = call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
ret <8 x i16> %tmp
@@ -72,12 +69,11 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; CHECK-NEXT: li a0, 127
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmerge.vxm v10, v10, a0, v0
; CHECK-NEXT: vsll.vv v11, v8, v9
; CHECK-NEXT: vsra.vv v9, v11, v9
-; CHECK-NEXT: vmsne.vv v8, v8, v9
-; CHECK-NEXT: vmerge.vxm v9, v10, a0, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v11, v9, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v9
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
; CHECK-NEXT: ret
%tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
ret <16 x i8> %tmp
@@ -95,14 +91,13 @@ define <vscale x 2 x i64> @vec_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64>
; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: srli a1, a0, 1
-; CHECK-NEXT: vsll.vv v12, v8, v10
-; CHECK-NEXT: vsra.vv v14, v12, v10
-; CHECK-NEXT: vmsne.vv v10, v8, v14
-; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmv.v.x v12, a1
; CHECK-NEXT: slli a0, a0, 63
-; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
-; CHECK-NEXT: vmv1r.v v0, v10
-; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT: vmerge.vxm v12, v12, a0, v0
+; CHECK-NEXT: vsll.vv v14, v8, v10
+; CHECK-NEXT: vsra.vv v10, v14, v10
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmerge.vvm v8, v14, v12, v0
; CHECK-NEXT: ret
%tmp = call <vscale x 2 x i64> @llvm.sshl.sat.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
ret <vscale x 2 x i64> %tmp
@@ -116,12 +111,11 @@ define <vscale x 4 x i32> @vec_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32>
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: addi a1, a0, -1
; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vmerge.vxm v12, v12, a0, v0
; CHECK-NEXT: vsll.vv v14, v8, v10
-; CHECK-NEXT: vsra.vv v16, v14, v10
-; CHECK-NEXT: vmsne.vv v10, v8, v16
-; CHECK-NEXT: vmerge.vxm v8, v12, a0, v0
-; CHECK-NEXT: vmv1r.v v0, v10
-; CHECK-NEXT: vmerge.vvm v8, v14, v8, v0
+; CHECK-NEXT: vsra.vv v10, v14, v10
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmerge.vvm v8, v14, v12, v0
; CHECK-NEXT: ret
%tmp = call <vscale x 4 x i32> @llvm.sshl.sat.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
ret <vscale x 4 x i32> %tmp
@@ -135,12 +129,11 @@ define <vscale x 8 x i16> @vec_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16>
; CHECK-NEXT: lui a0, 8
; CHECK-NEXT: addi a1, a0, -1
; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vmerge.vxm v12, v12, a0, v0
; CHECK-NEXT: vsll.vv v14, v8, v10
-; CHECK-NEXT: vsra.vv v16, v14, v10
-; CHECK-NEXT: vmsne.vv v10, v8, v16
-; CHECK-NEXT: vmerge.vxm v8, v12, a0, v0
-; CHECK-NEXT: vmv1r.v v0, v10
-; CHECK-NEXT: vmerge.vvm v8, v14, v8, v0
+; CHECK-NEXT: vsra.vv v10, v14, v10
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmerge.vvm v8, v14, v12, v0
; CHECK-NEXT: ret
%tmp = call <vscale x 8 x i16> @llvm.sshl.sat.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
ret <vscale x 8 x i16> %tmp
@@ -154,12 +147,11 @@ define <vscale x 16 x i8> @vec_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8>
; CHECK-NEXT: li a0, 127
; CHECK-NEXT: vmv.v.x v12, a0
; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmerge.vxm v12, v12, a0, v0
; CHECK-NEXT: vsll.vv v14, v8, v10
-; CHECK-NEXT: vsra.vv v16, v14, v10
-; CHECK-NEXT: vmsne.vv v10, v8, v16
-; CHECK-NEXT: vmerge.vxm v8, v12, a0, v0
-; CHECK-NEXT: vmv1r.v v0, v10
-; CHECK-NEXT: vmerge.vvm v8, v14, v8, v0
+; CHECK-NEXT: vsra.vv v10, v14, v10
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmerge.vvm v8, v14, v12, v0
; CHECK-NEXT: ret
%tmp = call <vscale x 16 x i8> @llvm.sshl.sat.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
ret <vscale x 16 x i8> %tmp
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
index 2e5b67c93fce1a..37c2113b4196cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1f16(
define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmfeq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2f16(
define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmfeq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4f16(
define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmfeq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8f16(
define <vscale x 8 x i1> @intrinsic_vmfeq_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmfeq.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v10
+; CHECK-NEXT: vmfeq.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfeq.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfeq.mask.nxv16f16(
define <vscale x 16 x i1> @intrinsic_vmfeq_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmfeq.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v12
+; CHECK-NEXT: vmfeq.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfeq.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmfeq.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1f32(
define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmfeq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2f32(
define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmfeq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4f32(
define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmfeq.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v10
+; CHECK-NEXT: vmfeq.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfeq.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfeq.mask.nxv8f32(
define <vscale x 8 x i1> @intrinsic_vmfeq_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmfeq.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v12
+; CHECK-NEXT: vmfeq.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfeq.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfeq.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfeq.mask.nxv1f64(
define <vscale x 1 x i1> @intrinsic_vmfeq_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmfeq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmfeq.vv v0, v8, v9
; CHECK-NEXT: vmfeq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfeq.mask.nxv2f64(
define <vscale x 2 x i1> @intrinsic_vmfeq_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmfeq.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v10
+; CHECK-NEXT: vmfeq.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfeq.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmfeq.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfeq.mask.nxv4f64(
define <vscale x 4 x i1> @intrinsic_vmfeq_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfeq_mask_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmfeq.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfeq.vv v0, v8, v12
+; CHECK-NEXT: vmfeq.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfeq.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfeq.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
index b5ca47707c8a82..35a087384e38f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1f16(
define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v9, v8
; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2f16(
define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v9, v8
; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4f16(
define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v9, v8
; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8f16(
define <vscale x 8 x i1> @intrinsic_vmfge_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmfle.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v10, v8
+; CHECK-NEXT: vmfle.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfle.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16f16(
define <vscale x 16 x i1> @intrinsic_vmfge_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmfle.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v12, v8
+; CHECK-NEXT: vmfle.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmfge.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1f32(
define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v9, v8
; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2f32(
define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v9, v8
; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4f32(
define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmfle.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v10, v8
+; CHECK-NEXT: vmfle.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfle.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8f32(
define <vscale x 8 x i1> @intrinsic_vmfge_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmfle.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v12, v8
+; CHECK-NEXT: vmfle.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfge.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfge.mask.nxv1f64(
define <vscale x 1 x i1> @intrinsic_vmfge_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v9, v8
; CHECK-NEXT: vmfle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfge.mask.nxv2f64(
define <vscale x 2 x i1> @intrinsic_vmfge_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmfle.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v10, v8
+; CHECK-NEXT: vmfle.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfle.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfge.mask.nxv4f64(
define <vscale x 4 x i1> @intrinsic_vmfge_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfge_mask_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmfle.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v12, v8
+; CHECK-NEXT: vmfle.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfge.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
index 971249d38d1b26..bf5e547603c630 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1f16(
define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v9, v8
; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f16(
define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v9, v8
; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4f16(
define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v9, v8
; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8f16(
define <vscale x 8 x i1> @intrinsic_vmfgt_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmflt.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v10, v8
+; CHECK-NEXT: vmflt.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmflt.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16f16(
define <vscale x 16 x i1> @intrinsic_vmfgt_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmflt.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v12, v8
+; CHECK-NEXT: vmflt.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmfgt.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1f32(
define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v9, v8
; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f32(
define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v9, v8
; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4f32(
define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmflt.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v10, v8
+; CHECK-NEXT: vmflt.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmflt.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8f32(
define <vscale x 8 x i1> @intrinsic_vmfgt_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmflt.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v12, v8
+; CHECK-NEXT: vmflt.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1f64(
define <vscale x 1 x i1> @intrinsic_vmfgt_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v9, v8
; CHECK-NEXT: vmflt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f64(
define <vscale x 2 x i1> @intrinsic_vmfgt_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmflt.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v10, v8
+; CHECK-NEXT: vmflt.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmflt.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4f64(
define <vscale x 4 x i1> @intrinsic_vmfgt_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfgt_mask_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmflt.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v12, v8
+; CHECK-NEXT: vmflt.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
index f19a181a365afc..df5e97d85ca502 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1f16(
define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v8, v9
; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2f16(
define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v8, v9
; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4f16(
define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v8, v9
; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8f16(
define <vscale x 8 x i1> @intrinsic_vmfle_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmfle.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v10
+; CHECK-NEXT: vmfle.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfle.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16f16(
define <vscale x 16 x i1> @intrinsic_vmfle_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmfle.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v12
+; CHECK-NEXT: vmfle.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1f32(
define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v8, v9
; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2f32(
define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v8, v9
; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4f32(
define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmfle.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v10
+; CHECK-NEXT: vmfle.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfle.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8f32(
define <vscale x 8 x i1> @intrinsic_vmfle_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmfle.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v12
+; CHECK-NEXT: vmfle.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1f64(
define <vscale x 1 x i1> @intrinsic_vmfle_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmfle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmfle.vv v0, v8, v9
; CHECK-NEXT: vmfle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2f64(
define <vscale x 2 x i1> @intrinsic_vmfle_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmfle.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v10
+; CHECK-NEXT: vmfle.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfle.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4f64(
define <vscale x 4 x i1> @intrinsic_vmfle_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfle_mask_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmfle.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfle.vv v0, v8, v12
+; CHECK-NEXT: vmfle.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
index 0a046422193342..7065dcf502dded 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1f16(
define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v8, v9
; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2f16(
define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v8, v9
; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4f16(
define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v8, v9
; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8f16(
define <vscale x 8 x i1> @intrinsic_vmflt_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmflt.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v10
+; CHECK-NEXT: vmflt.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmflt.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16f16(
define <vscale x 16 x i1> @intrinsic_vmflt_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmflt.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v12
+; CHECK-NEXT: vmflt.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmflt.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1f32(
define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v8, v9
; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2f32(
define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v8, v9
; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4f32(
define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmflt.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v10
+; CHECK-NEXT: vmflt.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmflt.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8f32(
define <vscale x 8 x i1> @intrinsic_vmflt_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmflt.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v12
+; CHECK-NEXT: vmflt.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmflt.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1f64(
define <vscale x 1 x i1> @intrinsic_vmflt_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmflt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmflt.vv v0, v8, v9
; CHECK-NEXT: vmflt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2f64(
define <vscale x 2 x i1> @intrinsic_vmflt_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmflt.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v10
+; CHECK-NEXT: vmflt.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmflt.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4f64(
define <vscale x 4 x i1> @intrinsic_vmflt_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmflt_mask_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmflt.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmflt.vv v0, v8, v12
+; CHECK-NEXT: vmflt.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
index 520099247e0f3d..dacbc846e75022 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1f16(
define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmfne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmfne.vv v0, v8, v9
; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2f16(
define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, <vscale x 2 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmfne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmfne.vv v0, v8, v9
; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4f16(
define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, <vscale x 4 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmfne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmfne.vv v0, v8, v9
; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -190,12 +187,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8f16(
define <vscale x 8 x i1> @intrinsic_vmfne_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmfne.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v10
+; CHECK-NEXT: vmfne.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfne.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8f16(
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16f16(
define <vscale x 16 x i1> @intrinsic_vmfne_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, <vscale x 16 x half> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmfne.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v12
+; CHECK-NEXT: vmfne.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfne.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmfne.nxv16f16(
@@ -294,10 +289,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1f32(
define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, <vscale x 1 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmfne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmfne.vv v0, v8, v9
; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -346,10 +340,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2f32(
define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, <vscale x 2 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmfne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmfne.vv v0, v8, v9
; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -398,12 +391,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4f32(
define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, <vscale x 4 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmfne.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v10
+; CHECK-NEXT: vmfne.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfne.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4f32(
@@ -450,12 +442,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8f32(
define <vscale x 8 x i1> @intrinsic_vmfne_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, <vscale x 8 x float> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmfne.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v12
+; CHECK-NEXT: vmfne.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfne.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmfne.nxv8f32(
@@ -502,10 +493,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1f64(
define <vscale x 1 x i1> @intrinsic_vmfne_mask_vv_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, <vscale x 1 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmfne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmfne.vv v0, v8, v9
; CHECK-NEXT: vmfne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -554,12 +544,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2f64(
define <vscale x 2 x i1> @intrinsic_vmfne_mask_vv_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, <vscale x 2 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmfne.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v10
+; CHECK-NEXT: vmfne.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmfne.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmfne.nxv2f64(
@@ -606,12 +595,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4f64(
define <vscale x 4 x i1> @intrinsic_vmfne_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, <vscale x 4 x double> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmfne_mask_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmfne.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmfne.vv v0, v8, v12
+; CHECK-NEXT: vmfne.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmfne.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmfne.nxv4f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
index c5769e0d1e5192..1fd2383c40d185 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmseq.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmseq_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmseq.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmseq_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmseq.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v10
+; CHECK-NEXT: vmseq.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmseq.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmseq.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmseq_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmseq.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v12
+; CHECK-NEXT: vmseq.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmseq.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmseq.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmseq_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmseq.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v10
+; CHECK-NEXT: vmseq.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmseq.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmseq.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmseq_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmseq.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v12
+; CHECK-NEXT: vmseq.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmseq.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmseq.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v10
+; CHECK-NEXT: vmseq.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmseq.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmseq.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmseq_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmseq.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v12
+; CHECK-NEXT: vmseq.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmseq.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmseq.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmseq_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmseq.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmseq.vv v0, v8, v9
; CHECK-NEXT: vmseq.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmseq.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmseq_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmseq.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v10
+; CHECK-NEXT: vmseq.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmseq.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmseq.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmseq.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmseq_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmseq_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmseq.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmseq.vv v0, v8, v12
+; CHECK-NEXT: vmseq.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmseq.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmseq.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
index 1ec304609699a6..c7e52d2d5bcdba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsge.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsge_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsge.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsge_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v10, v8
+; CHECK-NEXT: vmsle.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsge.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsge.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsge_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v12, v8
+; CHECK-NEXT: vmsle.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsge.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsge.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsge_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v10, v8
+; CHECK-NEXT: vmsle.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsge.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsge.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsge_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v12, v8
+; CHECK-NEXT: vmsle.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsge.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v10, v8
+; CHECK-NEXT: vmsle.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsge.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsge.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsge_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v12, v8
+; CHECK-NEXT: vmsle.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsge.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsge.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsge_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v9, v8
; CHECK-NEXT: vmsle.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsge.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsge_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v10, v8
+; CHECK-NEXT: vmsle.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsge.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsge.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsge_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsge_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v12, v8
+; CHECK-NEXT: vmsle.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsge.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
index 05cc7a9d8f7b42..56a07b29dbad6a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgeu.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsgeu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgeu.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsgeu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v10, v8
+; CHECK-NEXT: vmsleu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsgeu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsgeu.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsgeu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v12, v8
+; CHECK-NEXT: vmsleu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsgeu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgeu.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsgeu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v10, v8
+; CHECK-NEXT: vmsleu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsgeu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgeu.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsgeu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v12, v8
+; CHECK-NEXT: vmsleu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsgeu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v10, v8
+; CHECK-NEXT: vmsleu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsgeu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgeu.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsgeu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v12, v8
+; CHECK-NEXT: vmsleu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsgeu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgeu.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsgeu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v9, v8
; CHECK-NEXT: vmsleu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgeu.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsgeu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v10, v8
+; CHECK-NEXT: vmsleu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsgeu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgeu.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsgeu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgeu_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v12, v8
+; CHECK-NEXT: vmsleu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsgeu.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
index 98a03a2c562806..d7dee2e1bc580e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgt.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsgt_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgt.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsgt_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v10, v8
+; CHECK-NEXT: vmslt.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsgt.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsgt.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsgt_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v12, v8
+; CHECK-NEXT: vmslt.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsgt.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgt.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsgt_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v10, v8
+; CHECK-NEXT: vmslt.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsgt.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgt.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsgt_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v12, v8
+; CHECK-NEXT: vmslt.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsgt.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v10, v8
+; CHECK-NEXT: vmslt.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsgt.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgt.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsgt_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v12, v8
+; CHECK-NEXT: vmslt.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsgt.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgt.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsgt_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v9, v8
; CHECK-NEXT: vmslt.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsgt_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v10, v8
+; CHECK-NEXT: vmslt.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsgt.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgt.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsgt_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgt_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v12, v8
+; CHECK-NEXT: vmslt.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsgt.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
index 69b22573c289e5..fe9d522f6b401b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgtu.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsgtu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgtu.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsgtu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v10, v8
+; CHECK-NEXT: vmsltu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsgtu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsgtu.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsgtu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v12, v8
+; CHECK-NEXT: vmsltu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsgtu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgtu.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsgtu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v10, v8
+; CHECK-NEXT: vmsltu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsgtu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsgtu.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsgtu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v12, v8
+; CHECK-NEXT: vmsltu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsgtu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v10, v8
+; CHECK-NEXT: vmsltu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsgtu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsgtu.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsgtu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v12, v8
+; CHECK-NEXT: vmsltu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsgtu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsgtu.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsgtu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v9, v8
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v9, v8
; CHECK-NEXT: vmsltu.vv v11, v10, v9, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgtu.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsgtu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v10, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v10, v8
+; CHECK-NEXT: vmsltu.vv v14, v12, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v12, v10, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsgtu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsgtu.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsgtu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsgtu_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v12, v8
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v12, v8
+; CHECK-NEXT: vmsltu.vv v20, v16, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v16, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsgtu.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
index c8794e1b63900f..bc98b31957b255 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsle.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsle_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsle.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsle_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v10
+; CHECK-NEXT: vmsle.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsle.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsle.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsle_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v12
+; CHECK-NEXT: vmsle.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsle.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsle.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsle_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v10
+; CHECK-NEXT: vmsle.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsle.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsle.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsle_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v12
+; CHECK-NEXT: vmsle.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsle.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v10
+; CHECK-NEXT: vmsle.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsle.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsle.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsle_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v12
+; CHECK-NEXT: vmsle.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsle.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsle.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsle_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmsle.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmsle.vv v0, v8, v9
; CHECK-NEXT: vmsle.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsle.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsle_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmsle.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v10
+; CHECK-NEXT: vmsle.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsle.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsle.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsle.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsle_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsle_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmsle.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsle.vv v0, v8, v12
+; CHECK-NEXT: vmsle.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsle.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsle.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
index 86dc48d51cc2bc..731989cfe15d95 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsleu.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsleu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsleu.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsleu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v10
+; CHECK-NEXT: vmsleu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsleu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsleu.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsleu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v12
+; CHECK-NEXT: vmsleu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsleu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsleu.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsleu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v10
+; CHECK-NEXT: vmsleu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsleu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsleu.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsleu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v12
+; CHECK-NEXT: vmsleu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsleu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v10
+; CHECK-NEXT: vmsleu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsleu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsleu.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsleu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v12
+; CHECK-NEXT: vmsleu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsleu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsleu.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsleu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmsleu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmsleu.vv v0, v8, v9
; CHECK-NEXT: vmsleu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsleu.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsleu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmsleu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v10
+; CHECK-NEXT: vmsleu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsleu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsleu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsleu.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsleu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsleu_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmsleu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsleu.vv v0, v8, v12
+; CHECK-NEXT: vmsleu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsleu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsleu.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
index 8d57f2adc53868..407f85b4f5996b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmslt_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmslt_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v10
+; CHECK-NEXT: vmslt.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmslt.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmslt.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmslt_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v12
+; CHECK-NEXT: vmslt.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmslt.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmslt_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v10
+; CHECK-NEXT: vmslt.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmslt.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmslt_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v12
+; CHECK-NEXT: vmslt.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmslt.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v10
+; CHECK-NEXT: vmslt.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmslt.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmslt_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v12
+; CHECK-NEXT: vmslt.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmslt.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmslt_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmslt.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmslt.vv v0, v8, v9
; CHECK-NEXT: vmslt.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmslt_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmslt.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v10
+; CHECK-NEXT: vmslt.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmslt.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmslt.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmslt_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmslt_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmslt.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmslt.vv v0, v8, v12
+; CHECK-NEXT: vmslt.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmslt.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmslt.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
index 627b5943087137..e051b332018fd5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v10
+; CHECK-NEXT: vmsltu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsltu.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsltu.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsltu_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v12
+; CHECK-NEXT: vmsltu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsltu.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v10
+; CHECK-NEXT: vmsltu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsltu.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v12
+; CHECK-NEXT: vmsltu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsltu.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v10
+; CHECK-NEXT: vmsltu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsltu.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v12
+; CHECK-NEXT: vmsltu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsltu.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmsltu.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmsltu.vv v0, v8, v9
; CHECK-NEXT: vmsltu.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmsltu.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v10
+; CHECK-NEXT: vmsltu.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsltu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsltu.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsltu_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmsltu.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsltu.vv v0, v8, v12
+; CHECK-NEXT: vmsltu.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsltu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsltu.nxv4i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
index 47d1048f46cab4..1e21b847ed20d8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll
@@ -34,10 +34,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i8(
define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -86,10 +85,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i8(
define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -138,10 +136,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i8(
define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -190,10 +187,9 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsne.mask.nxv8i8(
define <vscale x 8 x i1> @intrinsic_vmsne_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -242,12 +238,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsne.mask.nxv16i8(
define <vscale x 16 x i1> @intrinsic_vmsne_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vmsne.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmsne.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsne.nxv16i8(
@@ -294,12 +289,11 @@ declare <vscale x 32 x i1> @llvm.riscv.vmsne.mask.nxv32i8(
define <vscale x 32 x i1> @intrinsic_vmsne_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i8> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vmsne.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v12
+; CHECK-NEXT: vmsne.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 32 x i1> @llvm.riscv.vmsne.nxv32i8(
@@ -346,10 +340,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i16(
define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -398,10 +391,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i16(
define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -450,10 +442,9 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i16(
define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -502,12 +493,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsne.mask.nxv8i16(
define <vscale x 8 x i1> @intrinsic_vmsne_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vmsne.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmsne.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsne.nxv8i16(
@@ -554,12 +544,11 @@ declare <vscale x 16 x i1> @llvm.riscv.vmsne.mask.nxv16i16(
define <vscale x 16 x i1> @intrinsic_vmsne_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i16> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vmsne.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v12
+; CHECK-NEXT: vmsne.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 16 x i1> @llvm.riscv.vmsne.nxv16i16(
@@ -606,10 +595,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i32(
define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv1r.v v0, v11
; CHECK-NEXT: ret
@@ -658,10 +646,9 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i32(
define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i32_nxv2i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -710,12 +697,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i32(
define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vmsne.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmsne.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsne.nxv4i32(
@@ -762,12 +748,11 @@ declare <vscale x 8 x i1> @llvm.riscv.vmsne.mask.nxv8i32(
define <vscale x 8 x i1> @intrinsic_vmsne_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i32> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vmsne.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v12
+; CHECK-NEXT: vmsne.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 8 x i1> @llvm.riscv.vmsne.nxv8i32(
@@ -814,10 +799,9 @@ declare <vscale x 1 x i1> @llvm.riscv.vmsne.mask.nxv1i64(
define <vscale x 1 x i1> @intrinsic_vmsne_mask_vv_nxv1i64_nxv1i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vmsne.vv v8, v8, v9
; CHECK-NEXT: vmv1r.v v11, v0
-; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
+; CHECK-NEXT: vmsne.vv v0, v8, v9
; CHECK-NEXT: vmsne.vv v11, v9, v10, v0.t
; CHECK-NEXT: vmv.v.v v0, v11
; CHECK-NEXT: ret
@@ -866,12 +850,11 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsne.mask.nxv2i64(
define <vscale x 2 x i1> @intrinsic_vmsne_mask_vv_nxv2i64_nxv2i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v14, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vmsne.vv v14, v8, v10
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v10
+; CHECK-NEXT: vmsne.vv v14, v10, v12, v0.t
; CHECK-NEXT: vmv1r.v v0, v14
-; CHECK-NEXT: vmsne.vv v8, v10, v12, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 2 x i1> @llvm.riscv.vmsne.nxv2i64(
@@ -918,12 +901,11 @@ declare <vscale x 4 x i1> @llvm.riscv.vmsne.mask.nxv4i64(
define <vscale x 4 x i1> @intrinsic_vmsne_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i64> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vmsne_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmv1r.v v20, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vmsne.vv v20, v8, v12
-; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmsne.vv v0, v8, v12
+; CHECK-NEXT: vmsne.vv v20, v12, v16, v0.t
; CHECK-NEXT: vmv1r.v v0, v20
-; CHECK-NEXT: vmsne.vv v8, v12, v16, v0.t
-; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: ret
entry:
%mask = call <vscale x 4 x i1> @llvm.riscv.vmsne.nxv4i64(
>From afaa7b865d2265e2d87e5f45d2e481af62e5c04d Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Wed, 30 Oct 2024 14:17:56 +0800
Subject: [PATCH 2/4] Remove VIOTA_M and add VMADC/VMSBC
---
llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
index 7e83509594507b..3ce289cf6d24cd 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
@@ -36,7 +36,17 @@ static inline bool isVectorMaskProducer(const MachineInstr *MI) {
case RISCV::VMSBF_M:
case RISCV::VMSIF_M:
case RISCV::VMSOF_M:
- case RISCV::VIOTA_M:
+ // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
+ case RISCV::VMADC_VV:
+ case RISCV::VMADC_VX:
+ case RISCV::VMADC_VI:
+ case RISCV::VMADC_VVM:
+ case RISCV::VMADC_VXM:
+ case RISCV::VMADC_VIM:
+ case RISCV::VMSBC_VV:
+ case RISCV::VMSBC_VX:
+ case RISCV::VMSBC_VVM:
+ case RISCV::VMSBC_VXM:
// Vector Integer Compare Instructions
case RISCV::VMSEQ_VV:
case RISCV::VMSEQ_VX:
>From e7be1061576ab51e551d34a709cea09071102928 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Wed, 30 Oct 2024 18:37:53 +0800
Subject: [PATCH 3/4] Add an option to disable this mutation
---
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b88bd18e7c8585..236e2ee1deaadd 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -109,6 +109,11 @@ static cl::opt<bool>
cl::desc("Enable the RISC-V VL Optimizer pass"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> EnableVectorMaskMutation(
+ "riscv-enable-vector-mask-mutation",
+ cl::desc("Enable the vector mask scheduling mutation"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -362,7 +367,7 @@ class RISCVPassConfig : public TargetPassConfig {
}
const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
- if (ST.hasVInstructions()) {
+ if (EnableVectorMaskMutation && ST.hasVInstructions()) {
DAG = DAG ? DAG : createGenericSchedLive(C);
DAG->addMutation(createRISCVVectorMaskDAGMutation(DAG->TRI));
}
>From f21427c28070effd320d5f65b1cfd60d94772923 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Thu, 31 Oct 2024 18:18:05 +0800
Subject: [PATCH 4/4] Skip LMUL=8 cases as a workaround to avoid spills
---
.../RISCV/RISCVVectorMaskDAGMutation.cpp | 9 +-
.../RISCV/rvv/fixed-vectors-fmaximum-vp.ll | 101 ++---
.../RISCV/rvv/fixed-vectors-fmaximum.ll | 3 +-
.../RISCV/rvv/fixed-vectors-fminimum-vp.ll | 101 ++---
.../RISCV/rvv/fixed-vectors-fminimum.ll | 3 +-
.../RISCV/rvv/fixed-vectors-reduction-fp.ll | 404 +++++++++++-------
.../RISCV/rvv/fixed-vectors-rint-vp.ll | 15 +-
.../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll | 355 +++++----------
llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 280 ++++++------
.../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll | 355 +++++----------
llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 280 ++++++------
11 files changed, 873 insertions(+), 1033 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
index 3ce289cf6d24cd..e6f3d8fe8cdd6a 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
@@ -12,11 +12,14 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/RISCVBaseInfo.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "RISCVRegisterInfo.h"
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/TargetParser/RISCVTargetParser.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -98,7 +101,11 @@ class RISCVVectorMaskDAGMutation : public ScheduleDAGMutation {
if (MI->findRegisterUseOperand(RISCV::V0, TRI))
NearestUseV0SU = &SU;
- if (NearestUseV0SU && NearestUseV0SU != &SU && isVectorMaskProducer(MI))
+ if (NearestUseV0SU && NearestUseV0SU != &SU && isVectorMaskProducer(MI) &&
+ // For LMUL=8 cases, there will be more possibilities to spill.
+ // FIXME: We should use RegPressureTracker to do fine-grained
+ // controls.
+ RISCVII::getLMul(MI->getDesc().TSFlags) != RISCVII::LMUL_8)
DAG->addEdge(&SU, SDep(NearestUseV0SU, SDep::Artificial));
}
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 216300b23f4524..53c72f87832216 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -572,8 +572,9 @@ define <16 x double> @vfmax_vv_v16f64_unmasked(<16 x double> %va, <16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -694,25 +695,21 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: li a3, 24
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB25_2
@@ -721,66 +718,52 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a2, -16
-; CHECK-NEXT: sltu a2, a2, a1
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a1, a2, a1
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v8, v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, a2, -16
+; CHECK-NEXT: sltu a1, a2, a0
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmax.vv v24, v16, v24
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vfmax.vv v16, v16, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v16, v8
-; CHECK-NEXT: vmv8r.v v8, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
+; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index 5b4959409d63c4..76610af7180d67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -259,8 +259,9 @@ define <16 x double> @vfmax_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index b11a48c2857e3c..2ca5ec19f7fe80 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -572,8 +572,9 @@ define <16 x double> @vfmin_vv_v16f64_unmasked(<16 x double> %va, <16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -694,25 +695,21 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 40
+; CHECK-NEXT: li a3, 24
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB25_2
@@ -721,66 +718,52 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a2, -16
-; CHECK-NEXT: sltu a2, a2, a1
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a1, a2, a1
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v8, v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, a2, -16
+; CHECK-NEXT: sltu a1, a2, a0
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmin.vv v24, v16, v24
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vfmin.vv v16, v16, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v16, v8
-; CHECK-NEXT: vmv8r.v v8, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
+; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index a9d3c99b8cde42..92e31cf1ecd823 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -259,8 +259,9 @@ define <16 x double> @vfmin_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 0f585887ac8312..4be680e272e5b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1948,26 +1948,41 @@ declare float @llvm.vector.reduce.fminimum.v64f32(<64 x float>)
define float @vreduce_fminimum_v64f32(ptr %x) {
; CHECK-LABEL: vreduce_fminimum_v64f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v16, (a0)
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v24
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB119_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
-; CHECK-NEXT: ret
+; CHECK-NEXT: j .LBB119_3
; CHECK-NEXT: .LBB119_2:
; CHECK-NEXT: vfredmin.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: .LBB119_3:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <64 x float>, ptr %x
%red = call float @llvm.vector.reduce.fminimum.v64f32(<64 x float> %v)
@@ -2008,61 +2023,73 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: addi a2, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v24, (a2)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vle32.v v16, (a2)
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vle32.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle32.v v24, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v24, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v24
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB121_2
@@ -2259,25 +2286,40 @@ declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)
define double @vreduce_fminimum_v32f64(ptr %x) {
; CHECK-LABEL: vreduce_fminimum_v32f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v24
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB131_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, %hi(.LCPI131_0)
; CHECK-NEXT: fld fa0, %lo(.LCPI131_0)(a0)
-; CHECK-NEXT: ret
+; CHECK-NEXT: j .LBB131_3
; CHECK-NEXT: .LBB131_2:
; CHECK-NEXT: vfredmin.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: .LBB131_3:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <32 x double>, ptr %x
%red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %v)
@@ -2316,61 +2358,73 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 384
+; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v24, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v24
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB133_2
@@ -2647,26 +2701,41 @@ declare float @llvm.vector.reduce.fmaximum.v64f32(<64 x float>)
define float @vreduce_fmaximum_v64f32(ptr %x) {
; CHECK-LABEL: vreduce_fmaximum_v64f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v16, (a0)
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vle32.v v24, (a1)
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v24
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB147_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
-; CHECK-NEXT: ret
+; CHECK-NEXT: j .LBB147_3
; CHECK-NEXT: .LBB147_2:
; CHECK-NEXT: vfredmax.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: .LBB147_3:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <64 x float>, ptr %x
%red = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> %v)
@@ -2707,61 +2776,73 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: addi a2, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v24, (a2)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vle32.v v16, (a2)
; CHECK-NEXT: addi a1, a0, 384
-; CHECK-NEXT: vle32.v v16, (a1)
+; CHECK-NEXT: vle32.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle32.v v24, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v24, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v24
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB149_2
@@ -2958,25 +3039,40 @@ declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
define double @vreduce_fmaximum_v32f64(ptr %x) {
; CHECK-LABEL: vreduce_fmaximum_v32f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle64.v v16, (a0)
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v24
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB159_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, %hi(.LCPI159_0)
; CHECK-NEXT: fld fa0, %lo(.LCPI159_0)(a0)
-; CHECK-NEXT: ret
+; CHECK-NEXT: j .LBB159_3
; CHECK-NEXT: .LBB159_2:
; CHECK-NEXT: vfredmax.vs v8, v8, v8
; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: .LBB159_3:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = load <32 x double>, ptr %x
%red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v)
@@ -3015,61 +3111,73 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a1, a0, 384
; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a1, a0, 384
+; CHECK-NEXT: vle64.v v8, (a1)
; CHECK-NEXT: addi a1, a0, 256
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v24, v24, v8
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v16
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v24, v16
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v24
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: vmfne.vv v16, v8, v8
; CHECK-NEXT: vcpop.m a0, v16
; CHECK-NEXT: beqz a0, .LBB161_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 9d2bd4974a0b83..1f856d04ca89fd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -586,17 +586,20 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: addi a2, a0, -16
+; CHECK-NEXT: sltu a0, a0, a2
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v7, v24, fa5
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfabs.v v24, v16
-; CHECK-NEXT: vmflt.vf v0, v24, fa5
; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index 2f92407b1965ba..5c6c782542d4fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -113,19 +113,31 @@ declare <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat>, <
define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
; CHECK-LABEL: vfmax_nxv16bf16_vv:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: vfmax.vv v16, v16, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v8, v16
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
ret <vscale x 16 x bfloat> %v
@@ -134,177 +146,55 @@ define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
declare <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; ZVFH-LABEL: vfmax_nxv32bf16_vv:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 5
-; ZVFH-NEXT: sub sp, sp, a0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vmv8r.v v0, v8
-; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v0, v24, v24
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmerge.vvm v8, v8, v24, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v4
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v0, v16, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v28
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vfmax.vv v24, v16, v24
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vmerge.vvm v8, v8, v16, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vfmax.vv v16, v8, v16
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 5
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmax_nxv32bf16_vv:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v0, v8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v4
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v28
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmax.vv v24, v16, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: ret
+; CHECK-LABEL: vfmax_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv8r.v v0, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v3, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfmax.vv v16, v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
ret <vscale x 32 x bfloat> %v
}
@@ -456,19 +346,31 @@ define <vscale x 16 x half> @vfmax_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
;
; ZVFHMIN-LABEL: vfmax_nxv16f16_vv:
; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
-; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8
+; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.maximum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
ret <vscale x 16 x half> %v
@@ -481,8 +383,9 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmfeq.vv v0, v16, v16
+; ZVFH-NEXT: vmv1r.v v0, v7
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmax.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -491,82 +394,48 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv8r.v v0, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
+; ZVFHMIN-NEXT: vmfeq.vv v3, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT: vfmax.vv v8, v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v7, v8, v8
; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmax.vv v24, v16, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFHMIN-NEXT: vfmax.vv v16, v8, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -645,8 +514,9 @@ define <vscale x 16 x float> @vfmax_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -709,8 +579,9 @@ define <vscale x 8 x double> @vfmax_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 3c76bb6db601b3..648fc143981c5f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -250,6 +250,12 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <v
define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -257,12 +263,20 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: vfmax.vv v16, v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v8, v16
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.maximum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x bfloat> %v
@@ -423,10 +437,9 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: sub a3, a0, a1
@@ -439,33 +452,45 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v24, a2
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v6, v24, v24, v0.t
-; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv4r.v v8, v16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a2, a2, a4
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vfmax.vv v16, v16, v8, v0.t
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -475,7 +500,7 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB11_2:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -483,33 +508,24 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v16, v16, v8
+; CHECK-NEXT: vmfeq.vv v3, v16, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vfmax.vv v16, v16, v24
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -871,6 +887,12 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
;
; ZVFHMIN-LABEL: vfmax_vv_nxv16f16_unmasked:
; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -878,12 +900,20 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
-; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.maximum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x half> %v
@@ -1071,8 +1101,9 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmfeq.vv v0, v16, v16
+; ZVFH-NEXT: vmv1r.v v0, v7
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmax.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -1082,10 +1113,9 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: sub a3, a0, a1
@@ -1098,33 +1128,45 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v24, a2
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v6, v24, v24, v0.t
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
+; ZVFHMIN-NEXT: li a4, 24
+; ZVFHMIN-NEXT: mul a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t
+; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -1134,7 +1176,7 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: mv a0, a1
; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1142,33 +1184,24 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: li a2, 24
+; ZVFHMIN-NEXT: mul a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8
+; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16
; ZVFHMIN-NEXT: vmv8r.v v8, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -1454,8 +1487,9 @@ define <vscale x 8 x double> @vfmax_vv_nxv8f64_unmasked(<vscale x 8 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -1644,92 +1678,64 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: vmv8r.v v24, v16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vl8re64.v v16, (a3)
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a3, a3, a4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: sub a3, a2, a1
; CHECK-NEXT: sltu a4, a2, a3
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmv8r.v v16, v24
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a3, a3, a4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a3, a3, a4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vl8re64.v v16, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vfmax.vv v8, v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a2, a1, .LBB41_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmax.vv v24, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a0, a0, a3
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB41_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB41_2:
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index c0707d267f5af0..185a7a8977b79b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -113,19 +113,31 @@ declare <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat>, <
define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
; CHECK-LABEL: vfmin_nxv16bf16_vv:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: vfmin.vv v16, v16, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v8, v16
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
ret <vscale x 16 x bfloat> %v
@@ -134,177 +146,55 @@ define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
declare <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; ZVFH-LABEL: vfmin_nxv32bf16_vv:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 5
-; ZVFH-NEXT: sub sp, sp, a0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vmv8r.v v0, v8
-; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v0, v24, v24
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmerge.vvm v8, v8, v24, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v4
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v0, v16, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v28
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: mv a1, a0
-; ZVFH-NEXT: slli a0, a0, 1
-; ZVFH-NEXT: add a0, a0, a1
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vfmin.vv v24, v16, v24
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vmerge.vvm v8, v8, v16, v0
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT: vfmin.vv v16, v8, v16
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 5
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmin_nxv32bf16_vv:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v0, v8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v4
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v28
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmin.vv v24, v16, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: ret
+; CHECK-LABEL: vfmin_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv8r.v v0, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v3, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfmin.vv v16, v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%v = call <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
ret <vscale x 32 x bfloat> %v
}
@@ -456,19 +346,31 @@ define <vscale x 16 x half> @vfmin_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
;
; ZVFHMIN-LABEL: vfmin_nxv16f16_vv:
; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
-; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8
+; ZVFHMIN-NEXT: vmfeq.vv v7, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.minimum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
ret <vscale x 16 x half> %v
@@ -481,8 +383,9 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmfeq.vv v0, v16, v16
+; ZVFH-NEXT: vmv1r.v v0, v7
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmin.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -491,82 +394,48 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv8r.v v0, v8
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v0
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v24, v0
+; ZVFHMIN-NEXT: vmfeq.vv v3, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT: vfmin.vv v8, v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v7, v8, v8
; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmin.vv v24, v16, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v16, v0
+; ZVFHMIN-NEXT: vfmin.vv v16, v8, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -645,8 +514,9 @@ define <vscale x 16 x float> @vfmin_nxv16f32_vv(<vscale x 16 x float> %a, <vscal
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -709,8 +579,9 @@ define <vscale x 8 x double> @vfmin_nxv8f64_vv(<vscale x 8 x double> %a, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 9750a9b5477c92..c81e667b189a6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -250,6 +250,12 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <v
define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
; CHECK-LABEL: vfmin_vv_nxv16bf16_unmasked:
; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -257,12 +263,20 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: vfmin.vv v16, v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v8, v16
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.minimum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x bfloat> %v
@@ -423,10 +437,9 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: sub a3, a0, a1
@@ -439,33 +452,45 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v24, a2
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v6, v24, v24, v0.t
-; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv4r.v v8, v16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a2, a2, a4
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vfmin.vv v16, v16, v8, v0.t
; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -475,7 +500,7 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: mv a0, a1
; CHECK-NEXT: .LBB11_2:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -483,33 +508,24 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v16, v16, v8
+; CHECK-NEXT: vmfeq.vv v3, v16, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vfmin.vv v16, v16, v24
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -871,6 +887,12 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
;
; ZVFHMIN-LABEL: vfmin_vv_nxv16f16_unmasked:
; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -878,12 +900,20 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24
; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
-; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
%v = call <vscale x 16 x half> @llvm.vp.minimum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret <vscale x 16 x half> %v
@@ -1071,8 +1101,9 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v7, v16, v16
; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
-; ZVFH-NEXT: vmfeq.vv v0, v16, v16
+; ZVFH-NEXT: vmv1r.v v0, v7
; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
; ZVFH-NEXT: vfmin.vv v8, v8, v24
; ZVFH-NEXT: ret
@@ -1082,10 +1113,9 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: sub a3, a0, a1
@@ -1098,33 +1128,45 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v24, a2
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v6, v24, v24, v0.t
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
+; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
+; ZVFHMIN-NEXT: li a4, 24
+; ZVFHMIN-NEXT: mul a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t
+; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
; ZVFHMIN-NEXT: vmv1r.v v0, v12
-; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
; ZVFHMIN-NEXT: vmv1r.v v0, v7
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
@@ -1134,7 +1176,7 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: mv a0, a1
; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1142,33 +1184,24 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: li a2, 24
+; ZVFHMIN-NEXT: mul a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vmfeq.vv v0, v24, v24
-; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8
+; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0
+; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16
; ZVFHMIN-NEXT: vmv8r.v v8, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 24
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -1454,8 +1487,9 @@ define <vscale x 8 x double> @vfmin_vv_nxv8f64_unmasked(<vscale x 8 x double> %v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -1644,92 +1678,64 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: vmv8r.v v24, v16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vl8re64.v v16, (a3)
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a3, a3, a4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: sub a3, a2, a1
; CHECK-NEXT: sltu a4, a2, a3
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vmv8r.v v16, v24
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a3, a3, a4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a3, a3, a4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v0, v24, v24
-; CHECK-NEXT: vl8re64.v v16, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vfmin.vv v8, v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a2, a1, .LBB41_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfmin.vv v24, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a0, a0, a3
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB41_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB41_2:
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list