[llvm] [RISCV] Promote bf16 ops to f32 with zvfbfmin (PR #108937)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 17 00:35:55 PDT 2024
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/108937
For f16 with zvfhmin, we promote most ops and VP ops to f32. This does the same for bf16 with zvfbfmin, so the two fp types should now be in sync.
There are a few places in the custom lowering where we need to check for an LMUL 8 f16/bf16 vector that can't be promoted and must instead be split; this patch extracts that check into isPromotedOpNeedingSplit.
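For reference, a minimal sketch of what such a predicate might look like, inferred from the inline nxv32f16 check already in the tree (the name comes from the PR description, but the exact signature and the bf16 condition are assumptions, not the patch contents):

static bool isPromotedOpNeedingSplit(SDValue Op,
                                     const RISCVSubtarget &Subtarget) {
  // With only zvfhmin, f16 vector ops are promoted to f32, so an LMUL 8
  // (nxv32) f16 vector has no wider legal type and must be split first.
  if (Op.getValueType() == MVT::nxv32f16 &&
      Subtarget.hasVInstructionsF16Minimal() &&
      !Subtarget.hasVInstructionsF16())
    return true;
  // With zvfbfmin, bf16 vector ops are always promoted, so the same applies
  // to an LMUL 8 bf16 vector. (Hypothetical condition; the real patch may
  // gate this on subtarget features differently.)
  if (Op.getValueType() == MVT::nxv32bf16)
    return true;
  return false;
}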
In a follow-up NFC we can deduplicate the code that sets up the promotions.
Stacked upon #108765
From d911e98ff2ab3709d53d826ecec81e5be216e66f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sun, 15 Sep 2024 00:55:43 +0800
Subject: [PATCH 1/3] [RISCV] Split fp rounding ops with zvfhmin nxv32f16
This adds zvfhmin test coverage for fceil, ffloor, fnearbyint, frint, fround and froundeven, and splits these nodes at nxv32f16 to avoid crashing, similarly to what we already do for other promoted nodes.
This also marks ftrunc for promotion, which was previously missing; we already promote its VP counterpart, vp_froundtozero.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 17 +-
llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll | 304 ++++++++++++-----
llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll | 304 ++++++++++++-----
.../CodeGen/RISCV/rvv/fnearbyint-sdnode.ll | 319 +++++++++++++-----
llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll | 268 +++++++++++----
llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll | 304 ++++++++++++-----
.../CodeGen/RISCV/rvv/froundeven-sdnode.ll | 304 ++++++++++++-----
llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll | 268 +++++++++++----
8 files changed, 1504 insertions(+), 584 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index eb8ea95e2d8583..d88d400a20de80 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -938,12 +938,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// TODO: support more ops.
static const unsigned ZvfhminPromoteOps[] = {
- ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB,
- ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT,
- ISD::FCEIL, ISD::FFLOOR, ISD::FROUND, ISD::FROUNDEVEN,
- ISD::FRINT, ISD::FNEARBYINT, ISD::IS_FPCLASS, ISD::SETCC,
- ISD::FMAXIMUM, ISD::FMINIMUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
- ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA};
+ ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB,
+ ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT,
+ ISD::FCEIL, ISD::FTRUNC, ISD::FFLOOR, ISD::FROUND,
+ ISD::FROUNDEVEN, ISD::FRINT, ISD::FNEARBYINT, ISD::IS_FPCLASS,
+ ISD::SETCC, ISD::FMAXIMUM, ISD::FMINIMUM, ISD::STRICT_FADD,
+ ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT,
+ ISD::STRICT_FMA};
// TODO: support more vp ops.
static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD,
@@ -6925,6 +6926,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::FRINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
+ if (Op.getValueType() == MVT::nxv32f16 &&
+ (Subtarget.hasVInstructionsF16Minimal() &&
+ !Subtarget.hasVInstructionsF16()))
+ return SplitVectorOp(Op, DAG);
return lowerFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget);
case ISD::LRINT:
case ISD::LLRINT:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
index 9efc3183f15a52..111d1d8e07d3bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
@@ -1,124 +1,256 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) {
-; CHECK-LABEL: ceil_nxv1f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: ceil_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 3
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: ceil_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 3
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 1 x half> @llvm.ceil.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
}
declare <vscale x 1 x half> @llvm.ceil.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: ceil_nxv2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: ceil_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 3
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: ceil_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 3
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 2 x half> @llvm.ceil.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
}
declare <vscale x 2 x half> @llvm.ceil.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) {
-; CHECK-LABEL: ceil_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: ceil_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 3
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: ceil_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v10
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 3
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
}
declare <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) {
-; CHECK-LABEL: ceil_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, fa5
-; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: ceil_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfabs.v v10, v8
+; ZVFH-NEXT: vmflt.vf v0, v10, fa5
+; ZVFH-NEXT: fsrmi a0, 3
+; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: ceil_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v12
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 3
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 8 x half> @llvm.ceil.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
}
declare <vscale x 8 x half> @llvm.ceil.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) {
-; CHECK-LABEL: ceil_nxv16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, fa5
-; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: ceil_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfabs.v v12, v8
+; ZVFH-NEXT: vmflt.vf v0, v12, fa5
+; ZVFH-NEXT: fsrmi a0, 3
+; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: ceil_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 3
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 16 x half> @llvm.ceil.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
}
declare <vscale x 16 x half> @llvm.ceil.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
-; CHECK-LABEL: ceil_nxv32f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, fa5
-; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: ceil_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vfabs.v v16, v8
+; ZVFH-NEXT: vmflt.vf v0, v16, fa5
+; ZVFH-NEXT: fsrmi a0, 3
+; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: ceil_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 3
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 3
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 32 x half> @llvm.ceil.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
index ec60b3ed3e0c88..97d84e91744038 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
@@ -1,124 +1,256 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) {
-; CHECK-LABEL: floor_nxv1f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 2
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: floor_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 2
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: floor_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 2
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 1 x half> @llvm.floor.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
}
declare <vscale x 1 x half> @llvm.floor.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: floor_nxv2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 2
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: floor_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 2
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: floor_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 2
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 2 x half> @llvm.floor.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
}
declare <vscale x 2 x half> @llvm.floor.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) {
-; CHECK-LABEL: floor_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 2
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: floor_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 2
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: floor_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v10
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 2
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 4 x half> @llvm.floor.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
}
declare <vscale x 4 x half> @llvm.floor.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) {
-; CHECK-LABEL: floor_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, fa5
-; CHECK-NEXT: fsrmi a0, 2
-; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: floor_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfabs.v v10, v8
+; ZVFH-NEXT: vmflt.vf v0, v10, fa5
+; ZVFH-NEXT: fsrmi a0, 2
+; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: floor_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v12
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 2
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 8 x half> @llvm.floor.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
}
declare <vscale x 8 x half> @llvm.floor.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) {
-; CHECK-LABEL: floor_nxv16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, fa5
-; CHECK-NEXT: fsrmi a0, 2
-; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: floor_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfabs.v v12, v8
+; ZVFH-NEXT: vmflt.vf v0, v12, fa5
+; ZVFH-NEXT: fsrmi a0, 2
+; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: floor_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 2
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 16 x half> @llvm.floor.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
}
declare <vscale x 16 x half> @llvm.floor.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
-; CHECK-LABEL: floor_nxv32f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, fa5
-; CHECK-NEXT: fsrmi a0, 2
-; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: floor_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vfabs.v v16, v8
+; ZVFH-NEXT: vmflt.vf v0, v16, fa5
+; ZVFH-NEXT: fsrmi a0, 2
+; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: floor_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 2
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 2
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 32 x half> @llvm.floor.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
index 9e14852305caa1..0655b9d099cbb7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
@@ -1,124 +1,271 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %x) {
-; CHECK-LABEL: nearbyint_nxv1f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: frflags a0
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: fsflags a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nearbyint_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: frflags a0
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: fsflags a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nearbyint_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: frflags a0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: fsflags a0
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 1 x half> @llvm.nearbyint.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
}
declare <vscale x 1 x half> @llvm.nearbyint.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: nearbyint_nxv2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: frflags a0
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: fsflags a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nearbyint_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: frflags a0
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: fsflags a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nearbyint_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: frflags a0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: fsflags a0
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 2 x half> @llvm.nearbyint.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
}
declare <vscale x 2 x half> @llvm.nearbyint.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %x) {
-; CHECK-LABEL: nearbyint_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: frflags a0
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: fsflags a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nearbyint_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: frflags a0
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: fsflags a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nearbyint_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v10
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: frflags a0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: fsflags a0
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 4 x half> @llvm.nearbyint.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
}
declare <vscale x 4 x half> @llvm.nearbyint.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %x) {
-; CHECK-LABEL: nearbyint_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, fa5
-; CHECK-NEXT: frflags a0
-; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT: fsflags a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nearbyint_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfabs.v v10, v8
+; ZVFH-NEXT: vmflt.vf v0, v10, fa5
+; ZVFH-NEXT: frflags a0
+; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t
+; ZVFH-NEXT: fsflags a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nearbyint_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v12
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: frflags a0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: fsflags a0
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 8 x half> @llvm.nearbyint.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
}
declare <vscale x 8 x half> @llvm.nearbyint.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %x) {
-; CHECK-LABEL: nearbyint_nxv16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, fa5
-; CHECK-NEXT: frflags a0
-; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT: fsflags a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nearbyint_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfabs.v v12, v8
+; ZVFH-NEXT: vmflt.vf v0, v12, fa5
+; ZVFH-NEXT: frflags a0
+; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT: fsflags a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nearbyint_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: frflags a0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: fsflags a0
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 16 x half> @llvm.nearbyint.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
}
declare <vscale x 16 x half> @llvm.nearbyint.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %x) {
-; CHECK-LABEL: nearbyint_nxv32f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, fa5
-; CHECK-NEXT: frflags a0
-; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT: fsflags a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nearbyint_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vfabs.v v16, v8
+; ZVFH-NEXT: vmflt.vf v0, v16, fa5
+; ZVFH-NEXT: frflags a0
+; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT: fsflags a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nearbyint_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: frflags a0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: fsflags a0
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v24
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: frflags a0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT: fsflags a0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 32 x half> @llvm.nearbyint.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
index fb77b746549400..ca1f72ee4d524b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
@@ -1,112 +1,232 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
define <vscale x 1 x half> @rint_nxv1f16(<vscale x 1 x half> %x) {
-; CHECK-LABEL: rint_nxv1f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: rint_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: rint_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
}
declare <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @rint_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: rint_nxv2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: rint_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: rint_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 2 x half> @llvm.rint.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
}
declare <vscale x 2 x half> @llvm.rint.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @rint_nxv4f16(<vscale x 4 x half> %x) {
-; CHECK-LABEL: rint_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: rint_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: rint_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v10
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
}
declare <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @rint_nxv8f16(<vscale x 8 x half> %x) {
-; CHECK-LABEL: rint_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, fa5
-; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: rint_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfabs.v v10, v8
+; ZVFH-NEXT: vmflt.vf v0, v10, fa5
+; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: rint_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v12
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 8 x half> @llvm.rint.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
}
declare <vscale x 8 x half> @llvm.rint.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @rint_nxv16f16(<vscale x 16 x half> %x) {
-; CHECK-LABEL: rint_nxv16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, fa5
-; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: rint_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfabs.v v12, v8
+; ZVFH-NEXT: vmflt.vf v0, v12, fa5
+; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: rint_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 16 x half> @llvm.rint.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
}
declare <vscale x 16 x half> @llvm.rint.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @rint_nxv32f16(<vscale x 32 x half> %x) {
-; CHECK-LABEL: rint_nxv32f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, fa5
-; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: rint_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vfabs.v v16, v8
+; ZVFH-NEXT: vmflt.vf v0, v16, fa5
+; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: rint_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v24
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 32 x half> @llvm.rint.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
index bb6724eeb32006..a39abcc6ed0e27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
@@ -1,126 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
; This file tests the code generation for `llvm.round.*` on scalable vector type.
define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) {
-; CHECK-LABEL: round_nxv1f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: round_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 4
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: round_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 4
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 1 x half> @llvm.round.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
}
declare <vscale x 1 x half> @llvm.round.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: round_nxv2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: round_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 4
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: round_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 4
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 2 x half> @llvm.round.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
}
declare <vscale x 2 x half> @llvm.round.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) {
-; CHECK-LABEL: round_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: round_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 4
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: round_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v10
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 4
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 4 x half> @llvm.round.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
}
declare <vscale x 4 x half> @llvm.round.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) {
-; CHECK-LABEL: round_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, fa5
-; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: round_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfabs.v v10, v8
+; ZVFH-NEXT: vmflt.vf v0, v10, fa5
+; ZVFH-NEXT: fsrmi a0, 4
+; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: round_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v12
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 4
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 8 x half> @llvm.round.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
}
declare <vscale x 8 x half> @llvm.round.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) {
-; CHECK-LABEL: round_nxv16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, fa5
-; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: round_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfabs.v v12, v8
+; ZVFH-NEXT: vmflt.vf v0, v12, fa5
+; ZVFH-NEXT: fsrmi a0, 4
+; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: round_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 4
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 16 x half> @llvm.round.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
}
declare <vscale x 16 x half> @llvm.round.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) {
-; CHECK-LABEL: round_nxv32f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, fa5
-; CHECK-NEXT: fsrmi a0, 4
-; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: round_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vfabs.v v16, v8
+; ZVFH-NEXT: vmflt.vf v0, v16, fa5
+; ZVFH-NEXT: fsrmi a0, 4
+; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: round_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 4
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 4
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 32 x half> @llvm.round.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
index 6f5207a25518f5..52ad443bfdebda 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
@@ -1,126 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
; This file tests the code generation for `llvm.roundeven.*` on scalable vector type.
define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) {
-; CHECK-LABEL: roundeven_nxv1f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: roundeven_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 0
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: roundeven_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 1 x half> @llvm.roundeven.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
}
declare <vscale x 1 x half> @llvm.roundeven.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: roundeven_nxv2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: roundeven_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 0
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: roundeven_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 2 x half> @llvm.roundeven.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
}
declare <vscale x 2 x half> @llvm.roundeven.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) {
-; CHECK-LABEL: roundeven_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: roundeven_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: fsrmi a0, 0
+; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: roundeven_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v10
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 4 x half> @llvm.roundeven.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
}
declare <vscale x 4 x half> @llvm.roundeven.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) {
-; CHECK-LABEL: roundeven_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, fa5
-; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: roundeven_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfabs.v v10, v8
+; ZVFH-NEXT: vmflt.vf v0, v10, fa5
+; ZVFH-NEXT: fsrmi a0, 0
+; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: roundeven_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v12
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 8 x half> @llvm.roundeven.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
}
declare <vscale x 8 x half> @llvm.roundeven.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) {
-; CHECK-LABEL: roundeven_nxv16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, fa5
-; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: roundeven_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfabs.v v12, v8
+; ZVFH-NEXT: vmflt.vf v0, v12, fa5
+; ZVFH-NEXT: fsrmi a0, 0
+; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: roundeven_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 16 x half> @llvm.roundeven.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
}
declare <vscale x 16 x half> @llvm.roundeven.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) {
-; CHECK-LABEL: roundeven_nxv32f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, fa5
-; CHECK-NEXT: fsrmi a0, 0
-; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
-; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: roundeven_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vfabs.v v16, v8
+; ZVFH-NEXT: vmflt.vf v0, v16, fa5
+; ZVFH-NEXT: fsrmi a0, 0
+; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; ZVFH-NEXT: fsrm a0
+; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: roundeven_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: fsrmi a0, 0
+; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: fsrm a0
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 32 x half> @llvm.roundeven.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
index 8841232e7f76df..971424e8cea09e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
@@ -1,112 +1,232 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
define <vscale x 1 x half> @trunc_nxv1f16(<vscale x 1 x half> %x) {
-; CHECK-LABEL: trunc_nxv1f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: trunc_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: trunc_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 1 x half> @llvm.trunc.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
}
declare <vscale x 1 x half> @llvm.trunc.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @trunc_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: trunc_nxv2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: trunc_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: trunc_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 2 x half> @llvm.trunc.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
}
declare <vscale x 2 x half> @llvm.trunc.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @trunc_nxv4f16(<vscale x 4 x half> %x) {
-; CHECK-LABEL: trunc_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: trunc_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vfabs.v v9, v8
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5
+; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: trunc_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v10
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 4 x half> @llvm.trunc.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
}
declare <vscale x 4 x half> @llvm.trunc.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @trunc_nxv8f16(<vscale x 8 x half> %x) {
-; CHECK-LABEL: trunc_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, fa5
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: trunc_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vfabs.v v10, v8
+; ZVFH-NEXT: vmflt.vf v0, v10, fa5
+; ZVFH-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: trunc_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v12
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v12, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 8 x half> @llvm.trunc.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
}
declare <vscale x 8 x half> @llvm.trunc.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @trunc_nxv16f16(<vscale x 16 x half> %x) {
-; CHECK-LABEL: trunc_nxv16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, fa5
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: trunc_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfabs.v v12, v8
+; ZVFH-NEXT: vmflt.vf v0, v12, fa5
+; ZVFH-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: trunc_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 16 x half> @llvm.trunc.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
}
declare <vscale x 16 x half> @llvm.trunc.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @trunc_nxv32f16(<vscale x 32 x half> %x) {
-; CHECK-LABEL: trunc_nxv32f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, fa5
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT: ret
+; ZVFH-LABEL: trunc_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vfabs.v v16, v8
+; ZVFH-NEXT: vmflt.vf v0, v16, fa5
+; ZVFH-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: trunc_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v24, v16
+; ZVFHMIN-NEXT: lui a0, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a0
+; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v24
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT: ret
%a = call <vscale x 32 x half> @llvm.trunc.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
}
>From 5d63915b081dd5a826ac500f1104e14e1f32c2a8 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 16 Sep 2024 13:42:37 +0800
Subject: [PATCH 2/3] Update fixed-length vector tests
Marking ftrunc as promoted means these operations are no longer expanded.
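As a rough illustration (not part of the patch; the function name below is made up), a fixed-length trunc like the one sketched here, compiled with -mattr=+zvfhmin, was previously scalarized element by element, which is where the large per-element costs and the long deleted check blocks below came from. With ftrunc marked as promoted it is instead widened to f32, truncated there, and narrowed back, matching the scalable-vector lowering:

; Illustrative sketch only -- not taken from the test files in this patch.
; Under zvfhmin this now promotes to <8 x float> instead of being expanded.
define <8 x half> @trunc_v8f16_example(<8 x half> %v) {
  %r = call <8 x half> @llvm.trunc.v8f16(<8 x half> %v)
  ret <8 x half> %r
}
declare <8 x half> @llvm.trunc.v8f16(<8 x half>)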
---
llvm/test/Analysis/CostModel/RISCV/fround.ll | 16 +-
.../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 943 +-----------------
2 files changed, 45 insertions(+), 914 deletions(-)
diff --git a/llvm/test/Analysis/CostModel/RISCV/fround.ll b/llvm/test/Analysis/CostModel/RISCV/fround.ll
index dc501b82417d3d..b4740f223eca3a 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fround.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fround.ll
@@ -233,10 +233,10 @@ define void @trunc_fp16() {
;
; ZVFHMIN-LABEL: 'trunc_fp16'
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %1 = call half @llvm.trunc.f16(half undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %2 = call <2 x half> @llvm.trunc.v2f16(<2 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %3 = call <4 x half> @llvm.trunc.v4f16(<4 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %4 = call <8 x half> @llvm.trunc.v8f16(<8 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %5 = call <16 x half> @llvm.trunc.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.trunc.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.trunc.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.trunc.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.trunc.v16f16(<16 x half> undef)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.trunc.nxv1f16(<vscale x 1 x half> undef)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.trunc.nxv2f16(<vscale x 2 x half> undef)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.trunc.nxv4f16(<vscale x 4 x half> undef)
@@ -1108,10 +1108,10 @@ define void @vp_roundtozero_f16() {
; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVFHMIN-LABEL: 'vp_roundtozero_f16'
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %1 = call <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %2 = call <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %3 = call <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %4 = call <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <vscale x 1 x half> @llvm.vp.roundtozero.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.vp.roundtozero.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.vp.roundtozero.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 34a45c57441789..5ab8eab091c2e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -5112,457 +5112,24 @@ define void @trunc_v8f16(ptr %x) {
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
-; ZVFHMIN-ZFH-RV32-LABEL: trunc_v8f16:
-; ZVFHMIN-ZFH-RV32: # %bb.0:
-; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp
-; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI115_0)
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI115_0)(a1)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_2
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.1:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_2:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_4
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_4:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_6
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_6:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB115_8
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_8:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_10
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_10:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_12
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_12:
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB115_14
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_14:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB115_16
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB115_16:
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFH-RV32-NEXT: ret
-;
-; ZVFHMIN-ZFH-RV64-LABEL: trunc_v8f16:
-; ZVFHMIN-ZFH-RV64: # %bb.0:
-; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp
-; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI115_0)
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI115_0)(a1)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_2
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.1:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_2:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_4
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_4:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_6
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_6:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB115_8
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_8:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_10
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_10:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_12
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_12:
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB115_14
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_14:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB115_16
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB115_16:
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFH-RV64-NEXT: ret
-;
-; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v8f16:
-; ZVFHMIN-ZFHIN-RV32: # %bb.0:
-; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp
-; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_2
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.1:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_2:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_4
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.3:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_4:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_6
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_6:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB115_8
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_8:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_10
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_10:
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_12
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_12:
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB115_14
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_14:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB115_16
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB115_16:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFHIN-RV32-NEXT: ret
-;
-; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v8f16:
-; ZVFHMIN-ZFHIN-RV64: # %bb.0:
-; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp
-; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_2
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.1:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_2:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_4
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_4:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_6
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_6:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB115_8
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_8:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_10
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_10:
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_12
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_12:
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB115_14
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_14:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB115_16
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB115_16:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFHIN-RV64-NEXT: ret
+; ZVFHMIN-LABEL: trunc_v8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a0)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a1, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: vse16.v v8, (a0)
+; ZVFHMIN-NEXT: ret
%a = load <8 x half>, ptr %x
%b = call <8 x half> @llvm.trunc.v8f16(<8 x half> %a)
store <8 x half> %b, ptr %x
@@ -5587,461 +5154,25 @@ define void @trunc_v6f16(ptr %x) {
; ZVFH-NEXT: vse16.v v8, (a0)
; ZVFH-NEXT: ret
;
-; ZVFHMIN-ZFH-RV32-LABEL: trunc_v6f16:
-; ZVFHMIN-ZFH-RV32: # %bb.0:
-; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
-; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp
-; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: lui a1, %hi(.LCPI116_0)
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, %lo(.LCPI116_0)(a1)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_2
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.1:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_2:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 0(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_4
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.3:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa3, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_4:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_6
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.5:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa3, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_6:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 6(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa0, fa3
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a1, fa0, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a1, .LBB116_8
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.7:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa3, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa1, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa3, fa1, fa3
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_8:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_10
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.9:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a2, fa1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a2, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa1, fa4, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_10:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa2, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa2, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_12
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.11:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a3, fa4, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa2, a3, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa2, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_12:
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 12(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa3
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa4, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a3, .LBB116_14
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.13:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa4, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa2, fa4, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_14:
-; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp)
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFH-RV32-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: flt.h a2, fa3, fa5
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV32-NEXT: beqz a2, .LBB116_16
-; ZVFHMIN-ZFH-RV32-NEXT: # %bb.15:
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fcvt.h.w fa5, a1, rtz
-; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa5, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: .LBB116_16:
-; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu
-; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFH-RV32-NEXT: ret
-;
-; ZVFHMIN-ZFH-RV64-LABEL: trunc_v6f16:
-; ZVFHMIN-ZFH-RV64: # %bb.0:
-; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
-; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp
-; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: lui a1, %hi(.LCPI116_0)
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, %lo(.LCPI116_0)(a1)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_2
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.1:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_2:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 0(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_4
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.3:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa3, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_4:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_6
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.5:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa3, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa3, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_6:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 6(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa0, fa3
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a1, fa0, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a1, .LBB116_8
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.7:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa3, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa1, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa3, fa1, fa3
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_8:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_10
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.9:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a2, fa1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a2, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa1, fa4, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_10:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa2, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa2, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_12
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.11:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a3, fa4, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa2, a3, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa2, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_12:
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 12(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa3
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa4, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a3, fa4, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a3, .LBB116_14
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.13:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa2, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa4, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa2, fa4, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_14:
-; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp)
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a2
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFH-RV64-NEXT: fabs.h fa3, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: flt.h a2, fa3, fa5
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV64-NEXT: beqz a2, .LBB116_16
-; ZVFHMIN-ZFH-RV64-NEXT: # %bb.15:
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.w.h a1, fa4, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fcvt.h.w fa5, a1, rtz
-; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa5, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: .LBB116_16:
-; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu
-; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFH-RV64-NEXT: ret
-;
-; ZVFHMIN-ZFHIN-RV32-LABEL: trunc_v6f16:
-; ZVFHMIN-ZFHIN-RV32: # %bb.0:
-; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
-; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp
-; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa5, 2(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 307200
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.w.x fa5, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_2
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.1:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_2:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 0(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_4
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.3:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa3, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_4:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 4(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa3, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_6
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.5:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa3, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa3, fa2, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_6:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa0, 6(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa0
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa0, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a1, fa0, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a1, .LBB116_8
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.7:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa1, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_8:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa2, fa1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_10
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.9:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa1, a2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa2, fa1, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_10:
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa1, 8(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa2, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa1, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_12
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.11:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a3, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa2, a3, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa2, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_12:
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 12(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a3, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a3, .LBB116_14
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.13:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a2, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa3, a2, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_14:
-; ZVFHMIN-ZFHIN-RV32-NEXT: flh fa3, 14(sp)
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV32-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: flt.s a2, fa3, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: beqz a2, .LBB116_16
-; ZVFHMIN-ZFHIN-RV32-NEXT: # %bb.15:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.s.w fa5, a1, rtz
-; ZVFHMIN-ZFHIN-RV32-NEXT: fsgnj.s fa4, fa5, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: .LBB116_16:
-; ZVFHMIN-ZFHIN-RV32-NEXT: fcvt.h.s fa5, fa4
-; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa5
-; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu
-; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFHIN-RV32-NEXT: ret
-;
-; ZVFHMIN-ZFHIN-RV64-LABEL: trunc_v6f16:
-; ZVFHMIN-ZFHIN-RV64: # %bb.0:
-; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16
-; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
-; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0)
-; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp
-; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
-; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1)
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa5, 2(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 307200
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.w.x fa5, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_2
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.1:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_2:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 0(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_4
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.3:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa3, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_4:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 4(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa3, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_6
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.5:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa3, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa3, fa2, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_6:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa0, 6(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa0
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa0, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a1, fa0, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa1
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a1, .LBB116_8
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.7:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa1, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_8:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 10(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa2, fa1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_10
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.9:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa1, a2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa2, fa1, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_10:
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa1, 8(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa2, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa1, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a2, fa2
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_12
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.11:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a3, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa2, a3, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa2, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_12:
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 12(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a3, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a3, .LBB116_14
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.13:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a2, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa3, a2, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_14:
-; ZVFHMIN-ZFHIN-RV64-NEXT: flh fa3, 14(sp)
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa4, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.h fa4, fa3
-; ZVFHMIN-ZFHIN-RV64-NEXT: fabs.s fa3, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: flt.s a2, fa3, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: beqz a2, .LBB116_16
-; ZVFHMIN-ZFHIN-RV64-NEXT: # %bb.15:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.w.s a1, fa4, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.s.w fa5, a1, rtz
-; ZVFHMIN-ZFHIN-RV64-NEXT: fsgnj.s fa4, fa5, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: .LBB116_16:
-; ZVFHMIN-ZFHIN-RV64-NEXT: fcvt.h.s fa5, fa4
-; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa5
-; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1
-; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu
-; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0)
-; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16
-; ZVFHMIN-ZFHIN-RV64-NEXT: ret
+; ZVFHMIN-LABEL: trunc_v6f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a0)
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vfabs.v v8, v9
+; ZVFHMIN-NEXT: lui a1, 307200
+; ZVFHMIN-NEXT: fmv.w.x fa5, a1
+; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t
+; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: vse16.v v8, (a0)
+; ZVFHMIN-NEXT: ret
%a = load <6 x half>, ptr %x
%b = call <6 x half> @llvm.trunc.v6f16(<6 x half> %a)
store <6 x half> %b, ptr %x
>From d091eb11d9d841d366b0bb2e09fa75cf959a110c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 17 Sep 2024 15:24:28 +0800
Subject: [PATCH 3/3] [RISCV] Promote bf16 ops to f32 with zvfbfmin
For f16 with zvfhmin, we promote most ops and VP ops to f32. This does the same for bf16 with zvfbfmin, so the two fp types should now be in sync.
There are a few places in the custom lowering where we need to check for an LMUL 8 f16/bf16 vector that can't be promoted and must be split; this extracts that check out into isPromotedOpNeedingSplit.
In a follow-up NFC we can deduplicate the code that sets up the promotions.
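As a rough sketch (illustrative only, not taken from the patch's tests; the function name and element count are made up), a scalable bf16 op that previously had no promotion path with only zvfbfmin is now expected to lower by widening to f32, operating at e32, and narrowing back to bf16:

; Hypothetical example. With -mattr=+zvfbfmin this should now select roughly
; vfwcvtbf16.f.f.v / vfadd.vv / vfncvtbf16.f.f.w instead of failing to lower.
define <vscale x 4 x bfloat> @bf16_fadd_example(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
  %c = fadd <vscale x 4 x bfloat> %a, %b
  ret <vscale x 4 x bfloat> %c
}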
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 145 +-
llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll | 536 +++-
llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll | 208 +-
llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll | 214 +-
llvm/test/CodeGen/RISCV/rvv/floor-vp.ll | 536 +++-
.../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll | 195 +-
llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 562 +++-
.../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll | 195 +-
llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 562 +++-
.../CodeGen/RISCV/rvv/fnearbyint-sdnode.ll | 223 +-
llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll | 196 +-
llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll | 208 +-
.../CodeGen/RISCV/rvv/froundeven-sdnode.ll | 207 +-
llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll | 196 +-
llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll | 530 +++-
llvm/test/CodeGen/RISCV/rvv/rint-vp.ll | 507 +++-
llvm/test/CodeGen/RISCV/rvv/round-vp.ll | 538 +++-
llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll | 538 +++-
llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll | 538 +++-
llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 1913 +++++++++++--
llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll | 1162 +++++++-
.../RISCV/rvv/vfadd-constrained-sdnode.ll | 243 +-
llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll | 256 +-
llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 681 ++++-
llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll | 55 +-
.../RISCV/rvv/vfdiv-constrained-sdnode.ll | 262 +-
llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll | 244 +-
llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll | 643 ++++-
llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 2449 ++++++++++++++---
.../RISCV/rvv/vfmadd-constrained-sdnode.ll | 402 ++-
llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll | 565 +++-
llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll | 247 +-
llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll | 291 +-
llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll | 247 +-
llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll | 291 +-
.../RISCV/rvv/vfmul-constrained-sdnode.ll | 243 +-
llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll | 256 +-
.../RISCV/rvv/vfsqrt-constrained-sdnode.ll | 114 +-
llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll | 109 +-
llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll | 257 +-
.../RISCV/rvv/vfsub-constrained-sdnode.ll | 262 +-
llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll | 256 +-
llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll | 643 ++++-
.../CodeGen/RISCV/rvv/vreductions-fp-vp.ll | 340 ++-
44 files changed, 17683 insertions(+), 1582 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d88d400a20de80..efc005dff325a8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -937,7 +937,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
};
// TODO: support more ops.
- static const unsigned ZvfhminPromoteOps[] = {
+ static const unsigned ZvfhminZvfbfminPromoteOps[] = {
ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, ISD::FSUB,
ISD::FMUL, ISD::FMA, ISD::FDIV, ISD::FSQRT,
ISD::FCEIL, ISD::FTRUNC, ISD::FFLOOR, ISD::FROUND,
@@ -947,30 +947,31 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::STRICT_FMA};
// TODO: support more vp ops.
- static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD,
- ISD::VP_FSUB,
- ISD::VP_FMUL,
- ISD::VP_FDIV,
- ISD::VP_FMA,
- ISD::VP_REDUCE_FADD,
- ISD::VP_REDUCE_SEQ_FADD,
- ISD::VP_REDUCE_FMIN,
- ISD::VP_REDUCE_FMAX,
- ISD::VP_SQRT,
- ISD::VP_FMINNUM,
- ISD::VP_FMAXNUM,
- ISD::VP_FCEIL,
- ISD::VP_FFLOOR,
- ISD::VP_FROUND,
- ISD::VP_FROUNDEVEN,
- ISD::VP_FROUNDTOZERO,
- ISD::VP_FRINT,
- ISD::VP_FNEARBYINT,
- ISD::VP_SETCC,
- ISD::VP_FMINIMUM,
- ISD::VP_FMAXIMUM,
- ISD::VP_REDUCE_FMINIMUM,
- ISD::VP_REDUCE_FMAXIMUM};
+ static const unsigned ZvfhminZvfbfminPromoteVPOps[] = {
+ ISD::VP_FADD,
+ ISD::VP_FSUB,
+ ISD::VP_FMUL,
+ ISD::VP_FDIV,
+ ISD::VP_FMA,
+ ISD::VP_REDUCE_FADD,
+ ISD::VP_REDUCE_SEQ_FADD,
+ ISD::VP_REDUCE_FMIN,
+ ISD::VP_REDUCE_FMAX,
+ ISD::VP_SQRT,
+ ISD::VP_FMINNUM,
+ ISD::VP_FMAXNUM,
+ ISD::VP_FCEIL,
+ ISD::VP_FFLOOR,
+ ISD::VP_FROUND,
+ ISD::VP_FROUNDEVEN,
+ ISD::VP_FROUNDTOZERO,
+ ISD::VP_FRINT,
+ ISD::VP_FNEARBYINT,
+ ISD::VP_SETCC,
+ ISD::VP_FMINIMUM,
+ ISD::VP_FMAXIMUM,
+ ISD::VP_REDUCE_FMINIMUM,
+ ISD::VP_REDUCE_FMAXIMUM};
// Sets common operation actions on RVV floating-point vector types.
const auto SetCommonVFPActions = [&](MVT VT) {
@@ -1093,20 +1094,20 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
- // Custom split nxv32f16 since nxv32f32 if not legal.
+ // Custom split nxv32f16 since nxv32f32 is not legal.
if (VT == MVT::nxv32f16) {
- setOperationAction(ZvfhminPromoteOps, VT, Custom);
- setOperationAction(ZvfhminPromoteVPOps, VT, Custom);
+ setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom);
+ setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom);
continue;
}
// Add more promote ops.
MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
- setOperationPromotedToType(ZvfhminPromoteOps, VT, F32VecVT);
- setOperationPromotedToType(ZvfhminPromoteVPOps, VT, F32VecVT);
+ setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT);
+ setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT);
}
}
- // TODO: Could we merge some code with zvfhmin?
+ // TODO: merge with zvfhmin
if (Subtarget.hasVInstructionsBF16Minimal()) {
for (MVT VT : BF16VecVTs) {
if (!isTypeLegal(VT))
@@ -1135,7 +1136,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
- // TODO: Promote to fp32.
+ // Custom split nxv32bf16 since nxv32f32 is not legal.
+ if (VT == MVT::nxv32bf16) {
+ setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom);
+ setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom);
+ continue;
+ }
+ // Add more promote ops.
+ MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+ setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT);
+ setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT);
}
}
@@ -1371,8 +1381,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// TODO: could split the f16 vector into two vectors and do promotion.
if (!isTypeLegal(F32VecVT))
continue;
- setOperationPromotedToType(ZvfhminPromoteOps, VT, F32VecVT);
- setOperationPromotedToType(ZvfhminPromoteVPOps, VT, F32VecVT);
+ setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT);
+ setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT);
continue;
}
@@ -6329,6 +6339,17 @@ static bool hasMaskOp(unsigned Opcode) {
return false;
}
+static bool isPromotedOpNeedingSplit(SDValue Op,
+ const RISCVSubtarget &Subtarget) {
+ if (Op.getValueType() == MVT::nxv32f16 &&
+ (Subtarget.hasVInstructionsF16Minimal() &&
+ !Subtarget.hasVInstructionsF16()))
+ return true;
+ if (Op.getValueType() == MVT::nxv32bf16)
+ return true;
+ return false;
+}
+
static SDValue SplitVectorOp(SDValue Op, SelectionDAG &DAG) {
auto [LoVT, HiVT] = DAG.GetSplitDestVTs(Op.getValueType());
SDLoc DL(Op);
@@ -6666,9 +6687,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
}
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
return lowerFMAXIMUM_FMINIMUM(Op, DAG, Subtarget);
case ISD::FP_EXTEND:
@@ -6684,8 +6703,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
(Subtarget.hasVInstructionsF16Minimal() &&
!Subtarget.hasVInstructionsF16())) ||
Op.getValueType().getScalarType() == MVT::bf16)) {
- if (Op.getValueType() == MVT::nxv32f16 ||
- Op.getValueType() == MVT::nxv32bf16)
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
// int -> f32
SDLoc DL(Op);
@@ -6705,8 +6723,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
(Subtarget.hasVInstructionsF16Minimal() &&
!Subtarget.hasVInstructionsF16())) ||
Op1.getValueType().getScalarType() == MVT::bf16)) {
- if (Op1.getValueType() == MVT::nxv32f16 ||
- Op1.getValueType() == MVT::nxv32bf16)
+ if (isPromotedOpNeedingSplit(Op1, Subtarget))
return SplitVectorOp(Op, DAG);
// [b]f16 -> f32
SDLoc DL(Op);
@@ -6926,9 +6943,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::FRINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
return lowerFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget);
case ISD::LRINT:
@@ -6986,9 +7001,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::VP_REDUCE_FMAX:
case ISD::VP_REDUCE_FMINIMUM:
case ISD::VP_REDUCE_FMAXIMUM:
- if (Op.getOperand(1).getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op.getOperand(1), Subtarget))
return SplitVectorReductionOp(Op, DAG);
return lowerVPREDUCE(Op, DAG);
case ISD::VP_REDUCE_AND:
@@ -7235,9 +7248,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return DAG.getSetCC(DL, VT, RHS, LHS, CCVal);
}
- if (Op.getOperand(0).getSimpleValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op.getOperand(0), Subtarget))
return SplitVectorOp(Op, DAG);
return lowerFixedLengthVectorSetccToRVV(Op, DAG);
@@ -7279,9 +7290,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::FMA:
case ISD::FMINNUM:
case ISD::FMAXNUM:
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
[[fallthrough]];
case ISD::AVGFLOORS:
@@ -7329,9 +7338,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::FCOPYSIGN:
if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16)
return lowerFCOPYSIGN(Op, DAG, Subtarget);
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG);
case ISD::STRICT_FADD:
@@ -7340,9 +7347,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::STRICT_FDIV:
case ISD::STRICT_FSQRT:
case ISD::STRICT_FMA:
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitStrictFPVectorOp(Op, DAG);
return lowerToScalableOp(Op, DAG);
case ISD::STRICT_FSETCC:
@@ -7399,9 +7404,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::VP_FMINNUM:
case ISD::VP_FMAXNUM:
case ISD::VP_FCOPYSIGN:
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVPOp(Op, DAG);
[[fallthrough]];
case ISD::VP_SRA:
@@ -7427,8 +7430,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
(Subtarget.hasVInstructionsF16Minimal() &&
!Subtarget.hasVInstructionsF16())) ||
Op.getValueType().getScalarType() == MVT::bf16)) {
- if (Op.getValueType() == MVT::nxv32f16 ||
- Op.getValueType() == MVT::nxv32bf16)
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
// int -> f32
SDLoc DL(Op);
@@ -7448,8 +7450,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
(Subtarget.hasVInstructionsF16Minimal() &&
!Subtarget.hasVInstructionsF16())) ||
Op1.getValueType().getScalarType() == MVT::bf16)) {
- if (Op1.getValueType() == MVT::nxv32f16 ||
- Op1.getValueType() == MVT::nxv32bf16)
+ if (isPromotedOpNeedingSplit(Op1, Subtarget))
return SplitVectorOp(Op, DAG);
// [b]f16 -> f32
SDLoc DL(Op);
@@ -7462,9 +7463,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
}
return lowerVPFPIntConvOp(Op, DAG);
case ISD::VP_SETCC:
- if (Op.getOperand(0).getSimpleValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op.getOperand(0), Subtarget))
return SplitVPOp(Op, DAG);
if (Op.getOperand(0).getSimpleValueType().getVectorElementType() == MVT::i1)
return lowerVPSetCCMaskOp(Op, DAG);
@@ -7499,16 +7498,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::VP_FROUND:
case ISD::VP_FROUNDEVEN:
case ISD::VP_FROUNDTOZERO:
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVPOp(Op, DAG);
return lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget);
case ISD::VP_FMAXIMUM:
case ISD::VP_FMINIMUM:
- if (Op.getValueType() == MVT::nxv32f16 &&
- (Subtarget.hasVInstructionsF16Minimal() &&
- !Subtarget.hasVInstructionsF16()))
+ if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVPOp(Op, DAG);
return lowerFMAXIMUM_FMINIMUM(Op, DAG, Subtarget);
case ISD::EXPERIMENTAL_VP_SPLICE:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index d613e4ee0bc256..15cff650765efa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -1,22 +1,428 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.ceil.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.ceil.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.ceil.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.ceil.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.ceil.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.ceil.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v12, v10, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v16, v12, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 3
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vmv1r.v v8, v16
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ceil_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 3
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.ceil.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI12_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
@@ -57,8 +463,8 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
define <vscale x 1 x half> @vp_ceil_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv1f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI13_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -97,8 +503,8 @@ declare <vscale x 2 x half> @llvm.vp.ceil.nxv2f16(<vscale x 2 x half>, <vscale x
define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI14_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
@@ -139,8 +545,8 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
define <vscale x 2 x half> @vp_ceil_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv2f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI15_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -179,8 +585,8 @@ declare <vscale x 4 x half> @llvm.vp.ceil.nxv4f16(<vscale x 4 x half>, <vscale x
define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI16_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
@@ -223,8 +629,8 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
define <vscale x 4 x half> @vp_ceil_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv4f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI17_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -263,8 +669,8 @@ declare <vscale x 8 x half> @llvm.vp.ceil.nxv8f16(<vscale x 8 x half>, <vscale x
define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI6_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8, v0.t
@@ -309,8 +715,8 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
define <vscale x 8 x half> @vp_ceil_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv8f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI19_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -349,8 +755,8 @@ declare <vscale x 16 x half> @llvm.vp.ceil.nxv16f16(<vscale x 16 x half>, <vscal
define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI8_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8, v0.t
@@ -395,8 +801,8 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
define <vscale x 16 x half> @vp_ceil_vv_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv16f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI9_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI21_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -435,8 +841,8 @@ declare <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half>, <vscal
define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI10_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v24, v8, v0.t
@@ -491,10 +897,10 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
@@ -533,8 +939,8 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_ceil_vv_nxv32f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI11_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -586,10 +992,10 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -834,8 +1240,8 @@ declare <vscale x 1 x double> @llvm.vp.ceil.nxv1f64(<vscale x 1 x double>, <vsca
define <vscale x 1 x double> @vp_ceil_vv_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
@@ -855,8 +1261,8 @@ define <vscale x 1 x double> @vp_ceil_vv_nxv1f64(<vscale x 1 x double> %va, <vsc
define <vscale x 1 x double> @vp_ceil_vv_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv1f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -876,8 +1282,8 @@ declare <vscale x 2 x double> @llvm.vp.ceil.nxv2f64(<vscale x 2 x double>, <vsca
define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -899,8 +1305,8 @@ define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vsc
define <vscale x 2 x double> @vp_ceil_vv_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv2f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI37_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -920,8 +1326,8 @@ declare <vscale x 4 x double> @llvm.vp.ceil.nxv4f64(<vscale x 4 x double>, <vsca
define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI26_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -943,8 +1349,8 @@ define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vsc
define <vscale x 4 x double> @vp_ceil_vv_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv4f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI27_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -964,8 +1370,8 @@ declare <vscale x 7 x double> @llvm.vp.ceil.nxv7f64(<vscale x 7 x double>, <vsca
define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv7f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI28_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -987,8 +1393,8 @@ define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vsc
define <vscale x 7 x double> @vp_ceil_vv_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv7f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI29_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI41_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1008,8 +1414,8 @@ declare <vscale x 8 x double> @llvm.vp.ceil.nxv8f64(<vscale x 8 x double>, <vsca
define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI30_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1031,8 +1437,8 @@ define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vsc
define <vscale x 8 x double> @vp_ceil_vv_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv8f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI31_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1065,8 +1471,8 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI44_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1087,10 +1493,10 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1118,8 +1524,8 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI33_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI45_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1132,10 +1538,10 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
index 111d1d8e07d3bf..ee16b476dc84ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
@@ -1,20 +1,172 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+define <vscale x 1 x bfloat> @ceil_nxv1bf16(<vscale x 1 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x bfloat> @llvm.ceil.nxv1bf16(<vscale x 1 x bfloat> %x)
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @ceil_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @ceil_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @ceil_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @ceil_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.ceil.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: ceil_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x bfloat> @llvm.ceil.nxv32bf16(<vscale x 32 x bfloat> %x)
+ ret <vscale x 32 x bfloat> %a
+}
+
define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) {
; ZVFH-LABEL: ceil_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -52,8 +204,8 @@ declare <vscale x 1 x half> @llvm.ceil.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) {
; ZVFH-LABEL: ceil_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -91,8 +243,8 @@ declare <vscale x 2 x half> @llvm.ceil.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) {
; ZVFH-LABEL: ceil_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -130,8 +282,8 @@ declare <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) {
; ZVFH-LABEL: ceil_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -169,8 +321,8 @@ declare <vscale x 8 x half> @llvm.ceil.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) {
; ZVFH-LABEL: ceil_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -208,8 +360,8 @@ declare <vscale x 16 x half> @llvm.ceil.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
; ZVFH-LABEL: ceil_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -359,8 +511,8 @@ declare <vscale x 16 x float> @llvm.ceil.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @ceil_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: ceil_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -379,8 +531,8 @@ declare <vscale x 1 x double> @llvm.ceil.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @ceil_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: ceil_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -399,8 +551,8 @@ declare <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @ceil_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: ceil_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -419,8 +571,8 @@ declare <vscale x 4 x double> @llvm.ceil.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @ceil_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: ceil_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
index 97d84e91744038..00e21ce8992b0c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
@@ -1,20 +1,178 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+define <vscale x 1 x bfloat> @floor_nxv1bf16(<vscale x 1 x bfloat> %x) {
+; CHECK-LABEL: floor_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x bfloat> @llvm.floor.nxv1bf16(<vscale x 1 x bfloat> %x)
+ ret <vscale x 1 x bfloat> %a
+}
+declare <vscale x 1 x bfloat> @llvm.floor.nxv1bf16(<vscale x 1 x bfloat>)
+
+define <vscale x 2 x bfloat> @floor_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: floor_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.floor.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+declare <vscale x 2 x bfloat> @llvm.floor.nxv2bf16(<vscale x 2 x bfloat>)
+
+define <vscale x 4 x bfloat> @floor_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: floor_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.floor.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+declare <vscale x 4 x bfloat> @llvm.floor.nxv4bf16(<vscale x 4 x bfloat>)
+
+define <vscale x 8 x bfloat> @floor_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: floor_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.floor.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+declare <vscale x 8 x bfloat> @llvm.floor.nxv8bf16(<vscale x 8 x bfloat>)
+
+define <vscale x 16 x bfloat> @floor_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: floor_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.floor.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+declare <vscale x 16 x bfloat> @llvm.floor.nxv16bf16(<vscale x 16 x bfloat>)
+
+define <vscale x 32 x bfloat> @floor_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: floor_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x bfloat> @llvm.floor.nxv32bf16(<vscale x 32 x bfloat> %x)
+ ret <vscale x 32 x bfloat> %a
+}
+declare <vscale x 32 x bfloat> @llvm.floor.nxv32bf16(<vscale x 32 x bfloat>)
+
define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) {
; ZVFH-LABEL: floor_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -52,8 +210,8 @@ declare <vscale x 1 x half> @llvm.floor.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) {
; ZVFH-LABEL: floor_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -91,8 +249,8 @@ declare <vscale x 2 x half> @llvm.floor.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) {
; ZVFH-LABEL: floor_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -130,8 +288,8 @@ declare <vscale x 4 x half> @llvm.floor.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) {
; ZVFH-LABEL: floor_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -169,8 +327,8 @@ declare <vscale x 8 x half> @llvm.floor.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) {
; ZVFH-LABEL: floor_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -208,8 +366,8 @@ declare <vscale x 16 x half> @llvm.floor.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
; ZVFH-LABEL: floor_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -359,8 +517,8 @@ declare <vscale x 16 x float> @llvm.floor.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @floor_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: floor_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -379,8 +537,8 @@ declare <vscale x 1 x double> @llvm.floor.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @floor_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: floor_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -399,8 +557,8 @@ declare <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @floor_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: floor_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -419,8 +577,8 @@ declare <vscale x 4 x double> @llvm.floor.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @floor_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: floor_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index 45334ea8648f74..03d1fb6c8d297f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -1,22 +1,428 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.floor.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vp_floor_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.floor.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vp_floor_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.floor.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.floor.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vp_floor_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.floor.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vp_floor_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.floor.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.floor.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vp_floor_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v12, v10, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.floor.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vp_floor_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.floor.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.floor.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vp_floor_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v16, v12, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.floor.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vp_floor_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.floor.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.floor.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vp_floor_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.floor.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vp_floor_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.floor.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.floor.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 2
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vmv1r.v v8, v16
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.floor.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_floor_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 2
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 2
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.floor.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.floor.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI12_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
@@ -57,8 +463,8 @@ define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x
define <vscale x 1 x half> @vp_floor_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv1f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI13_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -97,8 +503,8 @@ declare <vscale x 2 x half> @llvm.vp.floor.nxv2f16(<vscale x 2 x half>, <vscale
define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI14_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
@@ -139,8 +545,8 @@ define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x
define <vscale x 2 x half> @vp_floor_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv2f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI15_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -179,8 +585,8 @@ declare <vscale x 4 x half> @llvm.vp.floor.nxv4f16(<vscale x 4 x half>, <vscale
define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI16_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
@@ -223,8 +629,8 @@ define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x
define <vscale x 4 x half> @vp_floor_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv4f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI17_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -263,8 +669,8 @@ declare <vscale x 8 x half> @llvm.vp.floor.nxv8f16(<vscale x 8 x half>, <vscale
define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI6_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8, v0.t
@@ -309,8 +715,8 @@ define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x
define <vscale x 8 x half> @vp_floor_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv8f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI19_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -349,8 +755,8 @@ declare <vscale x 16 x half> @llvm.vp.floor.nxv16f16(<vscale x 16 x half>, <vsca
define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI8_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8, v0.t
@@ -395,8 +801,8 @@ define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale
define <vscale x 16 x half> @vp_floor_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv16f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI9_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI21_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -435,8 +841,8 @@ declare <vscale x 32 x half> @llvm.vp.floor.nxv32f16(<vscale x 32 x half>, <vsca
define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI10_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v24, v8, v0.t
@@ -491,10 +897,10 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
@@ -533,8 +939,8 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_floor_nxv32f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI11_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -586,10 +992,10 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -834,8 +1240,8 @@ declare <vscale x 1 x double> @llvm.vp.floor.nxv1f64(<vscale x 1 x double>, <vsc
define <vscale x 1 x double> @vp_floor_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
@@ -855,8 +1261,8 @@ define <vscale x 1 x double> @vp_floor_nxv1f64(<vscale x 1 x double> %va, <vscal
define <vscale x 1 x double> @vp_floor_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv1f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -876,8 +1282,8 @@ declare <vscale x 2 x double> @llvm.vp.floor.nxv2f64(<vscale x 2 x double>, <vsc
define <vscale x 2 x double> @vp_floor_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -899,8 +1305,8 @@ define <vscale x 2 x double> @vp_floor_nxv2f64(<vscale x 2 x double> %va, <vscal
define <vscale x 2 x double> @vp_floor_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv2f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI37_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -920,8 +1326,8 @@ declare <vscale x 4 x double> @llvm.vp.floor.nxv4f64(<vscale x 4 x double>, <vsc
define <vscale x 4 x double> @vp_floor_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI26_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -943,8 +1349,8 @@ define <vscale x 4 x double> @vp_floor_nxv4f64(<vscale x 4 x double> %va, <vscal
define <vscale x 4 x double> @vp_floor_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv4f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI27_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -964,8 +1370,8 @@ declare <vscale x 7 x double> @llvm.vp.floor.nxv7f64(<vscale x 7 x double>, <vsc
define <vscale x 7 x double> @vp_floor_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv7f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI28_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -987,8 +1393,8 @@ define <vscale x 7 x double> @vp_floor_nxv7f64(<vscale x 7 x double> %va, <vscal
define <vscale x 7 x double> @vp_floor_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv7f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI29_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI41_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1008,8 +1414,8 @@ declare <vscale x 8 x double> @llvm.vp.floor.nxv8f64(<vscale x 8 x double>, <vsc
define <vscale x 8 x double> @vp_floor_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI30_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1031,8 +1437,8 @@ define <vscale x 8 x double> @vp_floor_nxv8f64(<vscale x 8 x double> %va, <vscal
define <vscale x 8 x double> @vp_floor_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_floor_nxv8f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI31_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1065,8 +1471,8 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI44_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1087,10 +1493,10 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1118,8 +1524,8 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI33_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI45_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1132,10 +1538,10 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index 05896d8ef6ffdf..d8c3ab27cfad12 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -1,21 +1,200 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.maximum.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>)
+
+define <vscale x 1 x bfloat> @vfmax_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv1bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
+; CHECK-NEXT: vmfeq.vv v8, v10, v10
+; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vfmax.vv v9, v8, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.maximum.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+
+define <vscale x 2 x bfloat> @vfmax_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv2bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
+; CHECK-NEXT: vmfeq.vv v8, v10, v10
+; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vfmax.vv v9, v8, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+
+define <vscale x 4 x bfloat> @vfmax_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv4bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vmfeq.vv v8, v10, v10
+; CHECK-NEXT: vmerge.vvm v14, v12, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0
+; CHECK-NEXT: vfmax.vv v10, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define <vscale x 8 x bfloat> @vfmax_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv8bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vmerge.vvm v20, v16, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0
+; CHECK-NEXT: vfmax.vv v12, v8, v20
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>)
+
+define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv16bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v8, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
+
+define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
+; CHECK-LABEL: vfmax_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv8r.v v0, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v3, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfmax.vv v16, v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
+ ret <vscale x 32 x bfloat> %v
+}
+
declare <vscale x 1 x half> @llvm.maximum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>)
define <vscale x 1 x half> @vfmax_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1 x half> %b) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index ab07fff59b218f..320db35770cb82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -1,13 +1,541 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 \
+; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 \
+; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.maximum.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmax.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.maximum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v11, v11
+; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vfmax.vv v9, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.maximum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.maximum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmax.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.maximum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v11, v11
+; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vfmax.vv v9, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.maximum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.maximum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmerge.vvm v16, v12, v14, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v8, v14, v14, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v14, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmax.vv v10, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.maximum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0
+; CHECK-NEXT: vfmax.vv v10, v8, v14
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.maximum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.maximum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmerge.vvm v24, v16, v20, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfeq.vv v8, v20, v20, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v20, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfmax.vv v12, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.maximum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v16, v16
+; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0
+; CHECK-NEXT: vfmax.vv v12, v8, v20
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.maximum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.maximum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.maximum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v8, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.maximum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.maximum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 34
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: vmv8r.v v0, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v4
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv8r.v v0, v16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 34
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.maximum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v7, v24, a2
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmax.vv v16, v16, v8, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v4, v16
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v3, v16, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vfmax.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16
+; CHECK-NEXT: vmv8r.v v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.maximum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.maximum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -509,10 +1037,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -656,10 +1184,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -1093,10 +1621,10 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB28_2
+; CHECK-NEXT: bltu a2, a1, .LBB40_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB28_2:
+; CHECK-NEXT: .LBB40_2:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 18
; CHECK-NEXT: mul a0, a0, a1
@@ -1202,10 +1730,10 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vfmax.vv v8, v16, v8
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB29_2
+; CHECK-NEXT: bltu a2, a1, .LBB41_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB29_2:
+; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index e9425939249878..2371840002f40b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -1,21 +1,200 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.minimum.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>)
+
+define <vscale x 1 x bfloat> @vfmin_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv1bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
+; CHECK-NEXT: vmfeq.vv v8, v10, v10
+; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vfmin.vv v9, v8, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.minimum.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+
+define <vscale x 2 x bfloat> @vfmin_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv2bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
+; CHECK-NEXT: vmfeq.vv v8, v10, v10
+; CHECK-NEXT: vmerge.vvm v11, v9, v10, v0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vfmin.vv v9, v8, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+
+define <vscale x 4 x bfloat> @vfmin_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv4bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vmfeq.vv v8, v10, v10
+; CHECK-NEXT: vmerge.vvm v14, v12, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0
+; CHECK-NEXT: vfmin.vv v10, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define <vscale x 8 x bfloat> @vfmin_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv8bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vmerge.vvm v20, v16, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0
+; CHECK-NEXT: vfmin.vv v12, v8, v20
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>)
+
+define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv16bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v24, v24
+; CHECK-NEXT: vmfeq.vv v7, v16, v16
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v8, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
+
+define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
+; CHECK-LABEL: vfmin_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv8r.v v0, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v3, v24, v24
+; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT: vfmin.vv v8, v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vmfeq.vv v7, v8, v8
+; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfmin.vv v16, v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
+ ret <vscale x 32 x bfloat> %v
+}
+
declare <vscale x 1 x half> @llvm.minimum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>)
define <vscale x 1 x half> @vfmin_nxv1f16_vv(<vscale x 1 x half> %a, <vscale x 1 x half> %b) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index fc5b11284dab0c..03e3969f9141e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -1,13 +1,541 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 \
+; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 \
+; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.minimum.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmin.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.minimum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v11, v11
+; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vfmin.vv v9, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.minimum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.minimum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v11, v11, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vvm v9, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmin.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.minimum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v11, v11
+; CHECK-NEXT: vmerge.vvm v9, v10, v11, v0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v11, v10, v0
+; CHECK-NEXT: vfmin.vv v9, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.minimum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.minimum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmerge.vvm v16, v12, v14, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v8, v14, v14, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v14, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmin.vv v10, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.minimum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vmerge.vvm v14, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0
+; CHECK-NEXT: vfmin.vv v10, v8, v14
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.minimum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.minimum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmerge.vvm v24, v16, v20, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfeq.vv v8, v20, v20, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v20, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfmin.vv v12, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.minimum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v16, v16
+; CHECK-NEXT: vmerge.vvm v20, v12, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0
+; CHECK-NEXT: vfmin.vv v12, v8, v20
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.minimum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.minimum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.minimum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v7, v24, v24
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v8, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.minimum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.minimum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 34
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: vmv8r.v v0, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v4
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv8r.v v0, v16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmfeq.vv v12, v16, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 34
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.minimum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v7, v24, a2
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv4r.v v8, v16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: li a4, 24
+; CHECK-NEXT: mul a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfmin.vv v16, v16, v8, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v4, v16
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: li a2, 24
+; CHECK-NEXT: mul a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v3, v16, v16
+; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT: vfmin.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v0, v16
+; CHECK-NEXT: vmv8r.v v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.minimum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.minimum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -509,10 +1037,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -656,10 +1184,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -1093,10 +1621,10 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB28_2
+; CHECK-NEXT: bltu a2, a1, .LBB40_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB28_2:
+; CHECK-NEXT: .LBB40_2:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 18
; CHECK-NEXT: mul a0, a0, a1
@@ -1202,10 +1730,10 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vfmin.vv v8, v16, v8
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a2, a1, .LBB29_2
+; CHECK-NEXT: bltu a2, a1, .LBB41_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB29_2:
+; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
index 0655b9d099cbb7..9498c65ba9a176 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
@@ -1,20 +1,187 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+define <vscale x 1 x bfloat> @nearbyint_nxv1bf16(<vscale x 1 x bfloat> %x) {
+; CHECK-LABEL: nearbyint_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x bfloat> @llvm.nearbyint.nxv1bf16(<vscale x 1 x bfloat> %x)
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @nearbyint_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: nearbyint_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @nearbyint_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: nearbyint_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @nearbyint_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: nearbyint_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @nearbyint_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: nearbyint_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.nearbyint.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @nearbyint_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: nearbyint_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x bfloat> @llvm.nearbyint.nxv32bf16(<vscale x 32 x bfloat> %x)
+ ret <vscale x 32 x bfloat> %a
+}
+
define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %x) {
; ZVFH-LABEL: nearbyint_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -52,8 +219,8 @@ declare <vscale x 1 x half> @llvm.nearbyint.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %x) {
; ZVFH-LABEL: nearbyint_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -91,8 +258,8 @@ declare <vscale x 2 x half> @llvm.nearbyint.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %x) {
; ZVFH-LABEL: nearbyint_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -130,8 +297,8 @@ declare <vscale x 4 x half> @llvm.nearbyint.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %x) {
; ZVFH-LABEL: nearbyint_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -169,8 +336,8 @@ declare <vscale x 8 x half> @llvm.nearbyint.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %x) {
; ZVFH-LABEL: nearbyint_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -208,8 +375,8 @@ declare <vscale x 16 x half> @llvm.nearbyint.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %x) {
; ZVFH-LABEL: nearbyint_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -374,8 +541,8 @@ declare <vscale x 16 x float> @llvm.nearbyint.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @nearbyint_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: nearbyint_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -394,8 +561,8 @@ declare <vscale x 1 x double> @llvm.nearbyint.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @nearbyint_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: nearbyint_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -414,8 +581,8 @@ declare <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @nearbyint_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: nearbyint_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -434,8 +601,8 @@ declare <vscale x 4 x double> @llvm.nearbyint.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @nearbyint_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: nearbyint_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
index ca1f72ee4d524b..7fac8949c5517a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
@@ -1,20 +1,160 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+define <vscale x 1 x bfloat> @rint_nxv1bf16(<vscale x 1 x bfloat> %x) {
+; CHECK-LABEL: rint_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x bfloat> @llvm.rint.nxv1bf16(<vscale x 1 x bfloat> %x)
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @rint_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: rint_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.rint.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @rint_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: rint_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.rint.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @rint_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: rint_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.rint.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @rint_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: rint_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.rint.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @rint_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: rint_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x bfloat> @llvm.rint.nxv32bf16(<vscale x 32 x bfloat> %x)
+ ret <vscale x 32 x bfloat> %a
+}
+
define <vscale x 1 x half> @rint_nxv1f16(<vscale x 1 x half> %x) {
; ZVFH-LABEL: rint_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -48,8 +188,8 @@ declare <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @rint_nxv2f16(<vscale x 2 x half> %x) {
; ZVFH-LABEL: rint_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -83,8 +223,8 @@ declare <vscale x 2 x half> @llvm.rint.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @rint_nxv4f16(<vscale x 4 x half> %x) {
; ZVFH-LABEL: rint_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -118,8 +258,8 @@ declare <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @rint_nxv8f16(<vscale x 8 x half> %x) {
; ZVFH-LABEL: rint_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -153,8 +293,8 @@ declare <vscale x 8 x half> @llvm.rint.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @rint_nxv16f16(<vscale x 16 x half> %x) {
; ZVFH-LABEL: rint_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -188,8 +328,8 @@ declare <vscale x 16 x half> @llvm.rint.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @rint_nxv32f16(<vscale x 32 x half> %x) {
; ZVFH-LABEL: rint_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -325,8 +465,8 @@ declare <vscale x 16 x float> @llvm.rint.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @rint_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: rint_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -343,8 +483,8 @@ declare <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @rint_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: rint_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -361,8 +501,8 @@ declare <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @rint_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: rint_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -379,8 +519,8 @@ declare <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @rint_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: rint_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
index a39abcc6ed0e27..193773b0c89c9a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
@@ -1,22 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
; This file tests the code generation for `llvm.round.*` on scalable vector types.
+define <vscale x 1 x bfloat> @round_nxv1bf16(<vscale x 1 x bfloat> %x) {
+; CHECK-LABEL: round_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x bfloat> @llvm.round.nxv1bf16(<vscale x 1 x bfloat> %x)
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @round_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: round_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.round.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @round_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: round_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.round.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @round_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: round_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.round.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @round_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: round_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.round.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @round_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: round_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x bfloat> @llvm.round.nxv32bf16(<vscale x 32 x bfloat> %x)
+ ret <vscale x 32 x bfloat> %a
+}
+
define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) {
; ZVFH-LABEL: round_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -54,8 +206,8 @@ declare <vscale x 1 x half> @llvm.round.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) {
; ZVFH-LABEL: round_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -93,8 +245,8 @@ declare <vscale x 2 x half> @llvm.round.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) {
; ZVFH-LABEL: round_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -132,8 +284,8 @@ declare <vscale x 4 x half> @llvm.round.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) {
; ZVFH-LABEL: round_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -171,8 +323,8 @@ declare <vscale x 8 x half> @llvm.round.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) {
; ZVFH-LABEL: round_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -210,8 +362,8 @@ declare <vscale x 16 x half> @llvm.round.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) {
; ZVFH-LABEL: round_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -361,8 +513,8 @@ declare <vscale x 16 x float> @llvm.round.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @round_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: round_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -381,8 +533,8 @@ declare <vscale x 1 x double> @llvm.round.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @round_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: round_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -401,8 +553,8 @@ declare <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @round_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: round_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -421,8 +573,8 @@ declare <vscale x 4 x double> @llvm.round.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @round_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: round_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
index 52ad443bfdebda..052ee2d3a43cf2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
@@ -1,22 +1,173 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
; This file tests the code generation for `llvm.roundeven.*` on scalable vector types.
+define <vscale x 1 x bfloat> @roundeven_nxv1bf16(<vscale x 1 x bfloat> %x) {
+; CHECK-LABEL: roundeven_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x bfloat> @llvm.roundeven.nxv1bf16(<vscale x 1 x bfloat> %x)
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @roundeven_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: roundeven_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.roundeven.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @roundeven_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: roundeven_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @roundeven_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: roundeven_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.roundeven.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @roundeven_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: roundeven_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.roundeven.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @roundeven_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: roundeven_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x bfloat> @llvm.roundeven.nxv32bf16(<vscale x 32 x bfloat> %x)
+ ret <vscale x 32 x bfloat> %a
+}
define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) {
; ZVFH-LABEL: roundeven_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -54,8 +205,8 @@ declare <vscale x 1 x half> @llvm.roundeven.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) {
; ZVFH-LABEL: roundeven_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -93,8 +244,8 @@ declare <vscale x 2 x half> @llvm.roundeven.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) {
; ZVFH-LABEL: roundeven_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -132,8 +283,8 @@ declare <vscale x 4 x half> @llvm.roundeven.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) {
; ZVFH-LABEL: roundeven_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -171,8 +322,8 @@ declare <vscale x 8 x half> @llvm.roundeven.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) {
; ZVFH-LABEL: roundeven_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -210,8 +361,8 @@ declare <vscale x 16 x half> @llvm.roundeven.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) {
; ZVFH-LABEL: roundeven_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -361,8 +512,8 @@ declare <vscale x 16 x float> @llvm.roundeven.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @roundeven_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: roundeven_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -381,8 +532,8 @@ declare <vscale x 1 x double> @llvm.roundeven.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @roundeven_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: roundeven_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -401,8 +552,8 @@ declare <vscale x 2 x double> @llvm.roundeven.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @roundeven_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: roundeven_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -421,8 +572,8 @@ declare <vscale x 4 x double> @llvm.roundeven.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @roundeven_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: roundeven_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
index 971424e8cea09e..b29b24a9ce7b25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
@@ -1,20 +1,160 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+define <vscale x 1 x bfloat> @trunc_nxv1bf16(<vscale x 1 x bfloat> %x) {
+; CHECK-LABEL: trunc_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x bfloat> @llvm.trunc.nxv1bf16(<vscale x 1 x bfloat> %x)
+ ret <vscale x 1 x bfloat> %a
+}
+
+define <vscale x 2 x bfloat> @trunc_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: trunc_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x bfloat> @llvm.trunc.nxv2bf16(<vscale x 2 x bfloat> %x)
+ ret <vscale x 2 x bfloat> %a
+}
+
+define <vscale x 4 x bfloat> @trunc_nxv4bf16(<vscale x 4 x bfloat> %x) {
+; CHECK-LABEL: trunc_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x bfloat> @llvm.trunc.nxv4bf16(<vscale x 4 x bfloat> %x)
+ ret <vscale x 4 x bfloat> %a
+}
+
+define <vscale x 8 x bfloat> @trunc_nxv8bf16(<vscale x 8 x bfloat> %x) {
+; CHECK-LABEL: trunc_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v12, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x bfloat> @llvm.trunc.nxv8bf16(<vscale x 8 x bfloat> %x)
+ ret <vscale x 8 x bfloat> %a
+}
+
+define <vscale x 16 x bfloat> @trunc_nxv16bf16(<vscale x 16 x bfloat> %x) {
+; CHECK-LABEL: trunc_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x bfloat> @llvm.trunc.nxv16bf16(<vscale x 16 x bfloat> %x)
+ ret <vscale x 16 x bfloat> %a
+}
+
+define <vscale x 32 x bfloat> @trunc_nxv32bf16(<vscale x 32 x bfloat> %x) {
+; CHECK-LABEL: trunc_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: ret
+ %a = call <vscale x 32 x bfloat> @llvm.trunc.nxv32bf16(<vscale x 32 x bfloat> %x)
+ ret <vscale x 32 x bfloat> %a
+}
+
define <vscale x 1 x half> @trunc_nxv1f16(<vscale x 1 x half> %x) {
; ZVFH-LABEL: trunc_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI6_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -48,8 +188,8 @@ declare <vscale x 1 x half> @llvm.trunc.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @trunc_nxv2f16(<vscale x 2 x half> %x) {
; ZVFH-LABEL: trunc_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI7_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -83,8 +223,8 @@ declare <vscale x 2 x half> @llvm.trunc.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @trunc_nxv4f16(<vscale x 4 x half> %x) {
; ZVFH-LABEL: trunc_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI8_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -118,8 +258,8 @@ declare <vscale x 4 x half> @llvm.trunc.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @trunc_nxv8f16(<vscale x 8 x half> %x) {
; ZVFH-LABEL: trunc_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI9_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -153,8 +293,8 @@ declare <vscale x 8 x half> @llvm.trunc.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @trunc_nxv16f16(<vscale x 16 x half> %x) {
; ZVFH-LABEL: trunc_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI10_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -188,8 +328,8 @@ declare <vscale x 16 x half> @llvm.trunc.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @trunc_nxv32f16(<vscale x 32 x half> %x) {
; ZVFH-LABEL: trunc_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a0, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0)
+; ZVFH-NEXT: lui a0, %hi(.LCPI11_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0)
; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -325,8 +465,8 @@ declare <vscale x 16 x float> @llvm.trunc.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @trunc_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: trunc_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -343,8 +483,8 @@ declare <vscale x 1 x double> @llvm.trunc.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @trunc_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: trunc_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -361,8 +501,8 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @trunc_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: trunc_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -379,8 +519,8 @@ declare <vscale x 4 x double> @llvm.trunc.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @trunc_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: trunc_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI20_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index a3ea462b6a7376..5aa773b01e6926 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -1,20 +1,420 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.nearbyint.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vp_nearbyint_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.nearbyint.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vp_nearbyint_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.nearbyint.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.nearbyint.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vp_nearbyint_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vp_nearbyint_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.nearbyint.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vp_nearbyint_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v12, v10, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vp_nearbyint_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.nearbyint.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vp_nearbyint_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v16, v12, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vp_nearbyint_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.nearbyint.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vp_nearbyint_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.nearbyint.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vp_nearbyint_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.nearbyint.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.nearbyint.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT: frflags a2
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: fsflags a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.nearbyint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_nearbyint_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT: frflags a2
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: fsflags a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: frflags a0
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: fsflags a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.nearbyint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.nearbyint.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vp_nearbyint_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI12_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
@@ -55,8 +455,8 @@ define <vscale x 1 x half> @vp_nearbyint_nxv1f16(<vscale x 1 x half> %va, <vscal
define <vscale x 1 x half> @vp_nearbyint_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv1f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI13_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -95,8 +495,8 @@ declare <vscale x 2 x half> @llvm.vp.nearbyint.nxv2f16(<vscale x 2 x half>, <vsc
define <vscale x 2 x half> @vp_nearbyint_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI14_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
@@ -137,8 +537,8 @@ define <vscale x 2 x half> @vp_nearbyint_nxv2f16(<vscale x 2 x half> %va, <vscal
define <vscale x 2 x half> @vp_nearbyint_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv2f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI15_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -177,8 +577,8 @@ declare <vscale x 4 x half> @llvm.vp.nearbyint.nxv4f16(<vscale x 4 x half>, <vsc
define <vscale x 4 x half> @vp_nearbyint_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI16_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
@@ -221,8 +621,8 @@ define <vscale x 4 x half> @vp_nearbyint_nxv4f16(<vscale x 4 x half> %va, <vscal
define <vscale x 4 x half> @vp_nearbyint_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv4f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI17_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -261,8 +661,8 @@ declare <vscale x 8 x half> @llvm.vp.nearbyint.nxv8f16(<vscale x 8 x half>, <vsc
define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI6_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8, v0.t
@@ -307,8 +707,8 @@ define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscal
define <vscale x 8 x half> @vp_nearbyint_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv8f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI19_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -347,8 +747,8 @@ declare <vscale x 16 x half> @llvm.vp.nearbyint.nxv16f16(<vscale x 16 x half>, <
define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI8_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8, v0.t
@@ -393,8 +793,8 @@ define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vs
define <vscale x 16 x half> @vp_nearbyint_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv16f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI9_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI21_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -433,8 +833,8 @@ declare <vscale x 32 x half> @llvm.vp.nearbyint.nxv32f16(<vscale x 32 x half>, <
define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI10_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v24, v8, v0.t
@@ -489,10 +889,10 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
@@ -523,8 +923,8 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_nearbyint_nxv32f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI11_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -576,10 +976,10 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -824,8 +1224,8 @@ declare <vscale x 1 x double> @llvm.vp.nearbyint.nxv1f64(<vscale x 1 x double>,
define <vscale x 1 x double> @vp_nearbyint_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
@@ -845,8 +1245,8 @@ define <vscale x 1 x double> @vp_nearbyint_nxv1f64(<vscale x 1 x double> %va, <v
define <vscale x 1 x double> @vp_nearbyint_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv1f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -866,8 +1266,8 @@ declare <vscale x 2 x double> @llvm.vp.nearbyint.nxv2f64(<vscale x 2 x double>,
define <vscale x 2 x double> @vp_nearbyint_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -889,8 +1289,8 @@ define <vscale x 2 x double> @vp_nearbyint_nxv2f64(<vscale x 2 x double> %va, <v
define <vscale x 2 x double> @vp_nearbyint_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv2f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI37_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -910,8 +1310,8 @@ declare <vscale x 4 x double> @llvm.vp.nearbyint.nxv4f64(<vscale x 4 x double>,
define <vscale x 4 x double> @vp_nearbyint_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI26_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -933,8 +1333,8 @@ define <vscale x 4 x double> @vp_nearbyint_nxv4f64(<vscale x 4 x double> %va, <v
define <vscale x 4 x double> @vp_nearbyint_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv4f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI27_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -954,8 +1354,8 @@ declare <vscale x 7 x double> @llvm.vp.nearbyint.nxv7f64(<vscale x 7 x double>,
define <vscale x 7 x double> @vp_nearbyint_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv7f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI28_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -977,8 +1377,8 @@ define <vscale x 7 x double> @vp_nearbyint_nxv7f64(<vscale x 7 x double> %va, <v
define <vscale x 7 x double> @vp_nearbyint_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv7f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI29_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI41_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -998,8 +1398,8 @@ declare <vscale x 8 x double> @llvm.vp.nearbyint.nxv8f64(<vscale x 8 x double>,
define <vscale x 8 x double> @vp_nearbyint_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI30_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1021,8 +1421,8 @@ define <vscale x 8 x double> @vp_nearbyint_nxv8f64(<vscale x 8 x double> %va, <v
define <vscale x 8 x double> @vp_nearbyint_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_nearbyint_nxv8f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI31_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1049,8 +1449,8 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI44_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1067,10 +1467,10 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; CHECK-NEXT: fsflags a2
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1094,8 +1494,8 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI33_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI45_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1108,10 +1508,10 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; CHECK-NEXT: fsflags a2
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index 88bd92c6ec1612..a454f9dd97cebf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -1,20 +1,397 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.rint.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vp_rint_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.rint.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vp_rint_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.rint.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.rint.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vp_rint_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.rint.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vp_rint_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.rint.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.rint.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vp_rint_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v12, v10, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.rint.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vp_rint_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.rint.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.rint.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vp_rint_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v16, v12, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.rint.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vp_rint_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.rint.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.rint.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vp_rint_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.rint.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vp_rint_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.rint.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.rint.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmv1r.v v8, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.rint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vp_rint_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_rint_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.rint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.rint.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vp_rint_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI12_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
@@ -51,8 +428,8 @@ define <vscale x 1 x half> @vp_rint_nxv1f16(<vscale x 1 x half> %va, <vscale x 1
define <vscale x 1 x half> @vp_rint_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv1f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI13_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -87,8 +464,8 @@ declare <vscale x 2 x half> @llvm.vp.rint.nxv2f16(<vscale x 2 x half>, <vscale x
define <vscale x 2 x half> @vp_rint_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI14_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
@@ -125,8 +502,8 @@ define <vscale x 2 x half> @vp_rint_nxv2f16(<vscale x 2 x half> %va, <vscale x 2
define <vscale x 2 x half> @vp_rint_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv2f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI15_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -161,8 +538,8 @@ declare <vscale x 4 x half> @llvm.vp.rint.nxv4f16(<vscale x 4 x half>, <vscale x
define <vscale x 4 x half> @vp_rint_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI16_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
@@ -201,8 +578,8 @@ define <vscale x 4 x half> @vp_rint_nxv4f16(<vscale x 4 x half> %va, <vscale x 4
define <vscale x 4 x half> @vp_rint_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv4f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI17_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -237,8 +614,8 @@ declare <vscale x 8 x half> @llvm.vp.rint.nxv8f16(<vscale x 8 x half>, <vscale x
define <vscale x 8 x half> @vp_rint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI6_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8, v0.t
@@ -279,8 +656,8 @@ define <vscale x 8 x half> @vp_rint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8
define <vscale x 8 x half> @vp_rint_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv8f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI19_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -315,8 +692,8 @@ declare <vscale x 16 x half> @llvm.vp.rint.nxv16f16(<vscale x 16 x half>, <vscal
define <vscale x 16 x half> @vp_rint_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI8_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8, v0.t
@@ -357,8 +734,8 @@ define <vscale x 16 x half> @vp_rint_nxv16f16(<vscale x 16 x half> %va, <vscale
define <vscale x 16 x half> @vp_rint_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv16f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI9_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI21_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -393,8 +770,8 @@ declare <vscale x 32 x half> @llvm.vp.rint.nxv32f16(<vscale x 32 x half>, <vscal
define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI10_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v24, v8, v0.t
@@ -445,10 +822,10 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
@@ -482,8 +859,8 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_rint_nxv32f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI11_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -531,10 +908,10 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -757,8 +1134,8 @@ declare <vscale x 1 x double> @llvm.vp.rint.nxv1f64(<vscale x 1 x double>, <vsca
define <vscale x 1 x double> @vp_rint_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
@@ -776,8 +1153,8 @@ define <vscale x 1 x double> @vp_rint_nxv1f64(<vscale x 1 x double> %va, <vscale
define <vscale x 1 x double> @vp_rint_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv1f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -795,8 +1172,8 @@ declare <vscale x 2 x double> @llvm.vp.rint.nxv2f64(<vscale x 2 x double>, <vsca
define <vscale x 2 x double> @vp_rint_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -816,8 +1193,8 @@ define <vscale x 2 x double> @vp_rint_nxv2f64(<vscale x 2 x double> %va, <vscale
define <vscale x 2 x double> @vp_rint_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv2f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI37_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -835,8 +1212,8 @@ declare <vscale x 4 x double> @llvm.vp.rint.nxv4f64(<vscale x 4 x double>, <vsca
define <vscale x 4 x double> @vp_rint_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI26_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -856,8 +1233,8 @@ define <vscale x 4 x double> @vp_rint_nxv4f64(<vscale x 4 x double> %va, <vscale
define <vscale x 4 x double> @vp_rint_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv4f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI27_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -875,8 +1252,8 @@ declare <vscale x 7 x double> @llvm.vp.rint.nxv7f64(<vscale x 7 x double>, <vsca
define <vscale x 7 x double> @vp_rint_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv7f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI28_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -896,8 +1273,8 @@ define <vscale x 7 x double> @vp_rint_nxv7f64(<vscale x 7 x double> %va, <vscale
define <vscale x 7 x double> @vp_rint_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv7f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI29_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI41_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -915,8 +1292,8 @@ declare <vscale x 8 x double> @llvm.vp.rint.nxv8f64(<vscale x 8 x double>, <vsca
define <vscale x 8 x double> @vp_rint_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI30_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -936,8 +1313,8 @@ define <vscale x 8 x double> @vp_rint_nxv8f64(<vscale x 8 x double> %va, <vscale
define <vscale x 8 x double> @vp_rint_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_rint_nxv8f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI31_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -968,8 +1345,8 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI44_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -987,10 +1364,10 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1016,8 +1393,8 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI33_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI45_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1028,10 +1405,10 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 1ddadcc4937361..a4936483e8a152 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -1,20 +1,428 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.round.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vp_round_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.round.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vp_round_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.round.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.round.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vp_round_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.round.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vp_round_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.round.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.round.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vp_round_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v12, v10, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.round.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vp_round_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.round.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.round.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vp_round_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v16, v12, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.round.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vp_round_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.round.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.round.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vp_round_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.round.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vp_round_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.round.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.round.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 4
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vmv1r.v v8, v16
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.round.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_round_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 4
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 4
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.round.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.round.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI12_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
@@ -55,8 +463,8 @@ define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x
define <vscale x 1 x half> @vp_round_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv1f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI13_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -95,8 +503,8 @@ declare <vscale x 2 x half> @llvm.vp.round.nxv2f16(<vscale x 2 x half>, <vscale
define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI14_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
@@ -137,8 +545,8 @@ define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x
define <vscale x 2 x half> @vp_round_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv2f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI15_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -177,8 +585,8 @@ declare <vscale x 4 x half> @llvm.vp.round.nxv4f16(<vscale x 4 x half>, <vscale
define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI16_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
@@ -221,8 +629,8 @@ define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x
define <vscale x 4 x half> @vp_round_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv4f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI17_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -261,8 +669,8 @@ declare <vscale x 8 x half> @llvm.vp.round.nxv8f16(<vscale x 8 x half>, <vscale
define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI6_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8, v0.t
@@ -307,8 +715,8 @@ define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x
define <vscale x 8 x half> @vp_round_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv8f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI19_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -347,8 +755,8 @@ declare <vscale x 16 x half> @llvm.vp.round.nxv16f16(<vscale x 16 x half>, <vsca
define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI8_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8, v0.t
@@ -393,8 +801,8 @@ define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale
define <vscale x 16 x half> @vp_round_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv16f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI9_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI21_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -433,8 +841,8 @@ declare <vscale x 32 x half> @llvm.vp.round.nxv32f16(<vscale x 32 x half>, <vsca
define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI10_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v24, v8, v0.t
@@ -489,10 +897,10 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
@@ -531,8 +939,8 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_round_nxv32f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI11_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -584,10 +992,10 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -832,8 +1240,8 @@ declare <vscale x 1 x double> @llvm.vp.round.nxv1f64(<vscale x 1 x double>, <vsc
define <vscale x 1 x double> @vp_round_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
@@ -853,8 +1261,8 @@ define <vscale x 1 x double> @vp_round_nxv1f64(<vscale x 1 x double> %va, <vscal
define <vscale x 1 x double> @vp_round_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv1f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -874,8 +1282,8 @@ declare <vscale x 2 x double> @llvm.vp.round.nxv2f64(<vscale x 2 x double>, <vsc
define <vscale x 2 x double> @vp_round_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -897,8 +1305,8 @@ define <vscale x 2 x double> @vp_round_nxv2f64(<vscale x 2 x double> %va, <vscal
define <vscale x 2 x double> @vp_round_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv2f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI37_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -918,8 +1326,8 @@ declare <vscale x 4 x double> @llvm.vp.round.nxv4f64(<vscale x 4 x double>, <vsc
define <vscale x 4 x double> @vp_round_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI26_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -941,8 +1349,8 @@ define <vscale x 4 x double> @vp_round_nxv4f64(<vscale x 4 x double> %va, <vscal
define <vscale x 4 x double> @vp_round_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv4f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI27_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -962,8 +1370,8 @@ declare <vscale x 7 x double> @llvm.vp.round.nxv7f64(<vscale x 7 x double>, <vsc
define <vscale x 7 x double> @vp_round_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv7f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI28_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -985,8 +1393,8 @@ define <vscale x 7 x double> @vp_round_nxv7f64(<vscale x 7 x double> %va, <vscal
define <vscale x 7 x double> @vp_round_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv7f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI29_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI41_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1006,8 +1414,8 @@ declare <vscale x 8 x double> @llvm.vp.round.nxv8f64(<vscale x 8 x double>, <vsc
define <vscale x 8 x double> @vp_round_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI30_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1029,8 +1437,8 @@ define <vscale x 8 x double> @vp_round_nxv8f64(<vscale x 8 x double> %va, <vscal
define <vscale x 8 x double> @vp_round_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_round_nxv8f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI31_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1063,8 +1471,8 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI44_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1085,10 +1493,10 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1116,8 +1524,8 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI33_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI45_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1130,10 +1538,10 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index 8c5a7bb2dea6aa..9857009002eb90 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -1,20 +1,428 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.roundeven.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vp_roundeven_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.roundeven.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vp_roundeven_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.roundeven.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.roundeven.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vp_roundeven_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.roundeven.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vp_roundeven_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.roundeven.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.roundeven.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vp_roundeven_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v12, v10, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.roundeven.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vp_roundeven_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.roundeven.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.roundeven.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vp_roundeven_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v16, v12, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.roundeven.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vp_roundeven_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.roundeven.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.roundeven.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vp_roundeven_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.roundeven.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vp_roundeven_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.roundeven.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.roundeven.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 0
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vmv1r.v v8, v16
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.roundeven.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundeven_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 0
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.roundeven.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.roundeven.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI12_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
@@ -55,8 +463,8 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscal
define <vscale x 1 x half> @vp_roundeven_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv1f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI13_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -95,8 +503,8 @@ declare <vscale x 2 x half> @llvm.vp.roundeven.nxv2f16(<vscale x 2 x half>, <vsc
define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI14_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
@@ -137,8 +545,8 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscal
define <vscale x 2 x half> @vp_roundeven_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv2f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI15_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -177,8 +585,8 @@ declare <vscale x 4 x half> @llvm.vp.roundeven.nxv4f16(<vscale x 4 x half>, <vsc
define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI16_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
@@ -221,8 +629,8 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscal
define <vscale x 4 x half> @vp_roundeven_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv4f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI17_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -261,8 +669,8 @@ declare <vscale x 8 x half> @llvm.vp.roundeven.nxv8f16(<vscale x 8 x half>, <vsc
define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI6_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8, v0.t
@@ -307,8 +715,8 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscal
define <vscale x 8 x half> @vp_roundeven_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv8f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI19_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -347,8 +755,8 @@ declare <vscale x 16 x half> @llvm.vp.roundeven.nxv16f16(<vscale x 16 x half>, <
define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI8_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8, v0.t
@@ -393,8 +801,8 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vs
define <vscale x 16 x half> @vp_roundeven_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv16f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI9_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI21_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -433,8 +841,8 @@ declare <vscale x 32 x half> @llvm.vp.roundeven.nxv32f16(<vscale x 32 x half>, <
define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI10_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v24, v8, v0.t
@@ -489,10 +897,10 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
@@ -531,8 +939,8 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundeven_nxv32f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI11_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -584,10 +992,10 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -832,8 +1240,8 @@ declare <vscale x 1 x double> @llvm.vp.roundeven.nxv1f64(<vscale x 1 x double>,
define <vscale x 1 x double> @vp_roundeven_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
@@ -853,8 +1261,8 @@ define <vscale x 1 x double> @vp_roundeven_nxv1f64(<vscale x 1 x double> %va, <v
define <vscale x 1 x double> @vp_roundeven_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv1f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -874,8 +1282,8 @@ declare <vscale x 2 x double> @llvm.vp.roundeven.nxv2f64(<vscale x 2 x double>,
define <vscale x 2 x double> @vp_roundeven_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -897,8 +1305,8 @@ define <vscale x 2 x double> @vp_roundeven_nxv2f64(<vscale x 2 x double> %va, <v
define <vscale x 2 x double> @vp_roundeven_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv2f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI37_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -918,8 +1326,8 @@ declare <vscale x 4 x double> @llvm.vp.roundeven.nxv4f64(<vscale x 4 x double>,
define <vscale x 4 x double> @vp_roundeven_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI26_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -941,8 +1349,8 @@ define <vscale x 4 x double> @vp_roundeven_nxv4f64(<vscale x 4 x double> %va, <v
define <vscale x 4 x double> @vp_roundeven_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv4f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI27_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -962,8 +1370,8 @@ declare <vscale x 7 x double> @llvm.vp.roundeven.nxv7f64(<vscale x 7 x double>,
define <vscale x 7 x double> @vp_roundeven_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv7f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI28_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -985,8 +1393,8 @@ define <vscale x 7 x double> @vp_roundeven_nxv7f64(<vscale x 7 x double> %va, <v
define <vscale x 7 x double> @vp_roundeven_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv7f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI29_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI41_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1006,8 +1414,8 @@ declare <vscale x 8 x double> @llvm.vp.roundeven.nxv8f64(<vscale x 8 x double>,
define <vscale x 8 x double> @vp_roundeven_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI30_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1029,8 +1437,8 @@ define <vscale x 8 x double> @vp_roundeven_nxv8f64(<vscale x 8 x double> %va, <v
define <vscale x 8 x double> @vp_roundeven_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundeven_nxv8f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI31_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1063,8 +1471,8 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI44_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1085,10 +1493,10 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1116,8 +1524,8 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI33_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI45_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1130,10 +1538,10 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 1227e73a024325..11830c924867b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -1,20 +1,428 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.roundtozero.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vp_roundtozero_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.roundtozero.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vp_roundtozero_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.roundtozero.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.roundtozero.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vp_roundtozero_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.roundtozero.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vp_roundtozero_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.roundtozero.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.roundtozero.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vp_roundtozero_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v12, v10, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.roundtozero.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vp_roundtozero_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfabs.v v8, v10
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.roundtozero.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.roundtozero.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vp_roundtozero_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v16, v12, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.roundtozero.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vp_roundtozero_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v8, v12
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.roundtozero.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.roundtozero.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vp_roundtozero_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.roundtozero.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vp_roundtozero_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v16
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: fmv.w.x fa5, a0
+; CHECK-NEXT: vmflt.vf v0, v8, fa5
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.roundtozero.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.roundtozero.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 1
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vmv1r.v v8, v16
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.roundtozero.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_roundtozero_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v16, v16, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
+; CHECK-NEXT: lui a2, 307200
+; CHECK-NEXT: fmv.w.x fa5, a2
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT: fsrmi a2, 1
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT: fsrm a2
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v24, v16
+; CHECK-NEXT: vmflt.vf v0, v24, fa5
+; CHECK-NEXT: fsrmi a0, 1
+; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT: fsrm a0
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.roundtozero.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.roundtozero.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI0_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI12_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
@@ -55,8 +463,8 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vsc
define <vscale x 1 x half> @vp_roundtozero_nxv1f16_unmasked(<vscale x 1 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv1f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI1_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI13_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -95,8 +503,8 @@ declare <vscale x 2 x half> @llvm.vp.roundtozero.nxv2f16(<vscale x 2 x half>, <v
define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI2_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI14_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
@@ -137,8 +545,8 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vsc
define <vscale x 2 x half> @vp_roundtozero_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv2f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI3_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI15_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -177,8 +585,8 @@ declare <vscale x 4 x half> @llvm.vp.roundtozero.nxv4f16(<vscale x 4 x half>, <v
define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI4_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI16_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu
@@ -221,8 +629,8 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vsc
define <vscale x 4 x half> @vp_roundtozero_nxv4f16_unmasked(<vscale x 4 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv4f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI5_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI17_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vfabs.v v9, v8
; ZVFH-NEXT: vmflt.vf v0, v9, fa5
@@ -261,8 +669,8 @@ declare <vscale x 8 x half> @llvm.vp.roundtozero.nxv8f16(<vscale x 8 x half>, <v
define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv8f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI6_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI18_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI18_0)(a1)
; ZVFH-NEXT: vmv1r.v v10, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8, v0.t
@@ -307,8 +715,8 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vsc
define <vscale x 8 x half> @vp_roundtozero_nxv8f16_unmasked(<vscale x 8 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv8f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI7_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI19_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vfabs.v v10, v8
; ZVFH-NEXT: vmflt.vf v0, v10, fa5
@@ -347,8 +755,8 @@ declare <vscale x 16 x half> @llvm.vp.roundtozero.nxv16f16(<vscale x 16 x half>,
define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv16f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI8_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI20_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a1)
; ZVFH-NEXT: vmv1r.v v12, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8, v0.t
@@ -393,8 +801,8 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <
define <vscale x 16 x half> @vp_roundtozero_nxv16f16_unmasked(<vscale x 16 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv16f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI9_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI21_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFH-NEXT: vfabs.v v12, v8
; ZVFH-NEXT: vmflt.vf v0, v12, fa5
@@ -433,8 +841,8 @@ declare <vscale x 32 x half> @llvm.vp.roundtozero.nxv32f16(<vscale x 32 x half>,
define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv32f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI10_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI22_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a1)
; ZVFH-NEXT: vmv1r.v v16, v0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v24, v8, v0.t
@@ -489,10 +897,10 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
@@ -531,8 +939,8 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x half> %va, i32 zeroext %evl) {
; ZVFH-LABEL: vp_roundtozero_nxv32f16_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: lui a1, %hi(.LCPI11_0)
-; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a1)
+; ZVFH-NEXT: lui a1, %hi(.LCPI23_0)
+; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1)
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfabs.v v16, v8
; ZVFH-NEXT: vmflt.vf v0, v16, fa5
@@ -584,10 +992,10 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -832,8 +1240,8 @@ declare <vscale x 1 x double> @llvm.vp.roundtozero.nxv1f64(<vscale x 1 x double>
define <vscale x 1 x double> @vp_roundtozero_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI22_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
@@ -853,8 +1261,8 @@ define <vscale x 1 x double> @vp_roundtozero_nxv1f64(<vscale x 1 x double> %va,
define <vscale x 1 x double> @vp_roundtozero_nxv1f64_unmasked(<vscale x 1 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv1f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, fa5
@@ -874,8 +1282,8 @@ declare <vscale x 2 x double> @llvm.vp.roundtozero.nxv2f64(<vscale x 2 x double>
define <vscale x 2 x double> @vp_roundtozero_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI24_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a1)
; CHECK-NEXT: vmv1r.v v10, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v8, v0.t
@@ -897,8 +1305,8 @@ define <vscale x 2 x double> @vp_roundtozero_nxv2f64(<vscale x 2 x double> %va,
define <vscale x 2 x double> @vp_roundtozero_nxv2f64_unmasked(<vscale x 2 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv2f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI25_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI37_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, fa5
@@ -918,8 +1326,8 @@ declare <vscale x 4 x double> @llvm.vp.roundtozero.nxv4f64(<vscale x 4 x double>
define <vscale x 4 x double> @vp_roundtozero_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI26_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a1)
; CHECK-NEXT: vmv1r.v v12, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v8, v0.t
@@ -941,8 +1349,8 @@ define <vscale x 4 x double> @vp_roundtozero_nxv4f64(<vscale x 4 x double> %va,
define <vscale x 4 x double> @vp_roundtozero_nxv4f64_unmasked(<vscale x 4 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv4f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI27_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, fa5
@@ -962,8 +1370,8 @@ declare <vscale x 7 x double> @llvm.vp.roundtozero.nxv7f64(<vscale x 7 x double>
define <vscale x 7 x double> @vp_roundtozero_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv7f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI28_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI28_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI40_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -985,8 +1393,8 @@ define <vscale x 7 x double> @vp_roundtozero_nxv7f64(<vscale x 7 x double> %va,
define <vscale x 7 x double> @vp_roundtozero_nxv7f64_unmasked(<vscale x 7 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv7f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI29_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI29_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI41_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1006,8 +1414,8 @@ declare <vscale x 8 x double> @llvm.vp.roundtozero.nxv8f64(<vscale x 8 x double>
define <vscale x 8 x double> @vp_roundtozero_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI30_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI30_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI42_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a1)
; CHECK-NEXT: vmv1r.v v16, v0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1029,8 +1437,8 @@ define <vscale x 8 x double> @vp_roundtozero_nxv8f64(<vscale x 8 x double> %va,
define <vscale x 8 x double> @vp_roundtozero_nxv8f64_unmasked(<vscale x 8 x double> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_roundtozero_nxv8f64_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI31_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI31_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, fa5
@@ -1063,8 +1471,8 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v6, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI44_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1085,10 +1493,10 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8, v0.t
@@ -1116,8 +1524,8 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: lui a3, %hi(.LCPI33_0)
-; CHECK-NEXT: fld fa5, %lo(.LCPI33_0)(a3)
+; CHECK-NEXT: lui a3, %hi(.LCPI45_0)
+; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a3)
; CHECK-NEXT: sltu a3, a0, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
@@ -1130,10 +1538,10 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v8
; CHECK-NEXT: vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index e9b6126323de02..5ba4efa8458c79 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1,12 +1,1664 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, metadata, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i1> @fcmp_oeq_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v9, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"oeq", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_oeq_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v10, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"oeq", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_oeq_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"oeq", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ogt_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ogt_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ogt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ogt_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ogt_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ogt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ogt_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ogt_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"ogt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_oge_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oge_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"oge", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_oge_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oge_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"oge", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_oge_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oge_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v10, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"oge", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_olt_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_olt_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v9, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"olt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_olt_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_olt_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"olt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_olt_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_olt_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"olt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ole_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ole_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v9, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ole", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ole_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ole_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v10, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ole", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ole_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ole_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"ole", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_one_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_one_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"one", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_one_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_one_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"one", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_one_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_one_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v9, v8, v10, v0.t
+; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"one", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ord_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ord_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t
+; CHECK-NEXT: vmand.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ord", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ord_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ord_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t
+; CHECK-NEXT: vmand.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ord", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ord_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ord_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v10, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v10, v10, v0.t
+; CHECK-NEXT: vmand.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"ord", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ueq_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ueq_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vmflt.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmnor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ueq", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ueq_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmnor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ueq", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ueq_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ueq_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v9, v8, v10, v0.t
+; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t
+; CHECK-NEXT: vmnor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"ueq", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ugt_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ugt_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ugt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ugt_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ugt_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v10, v8, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ugt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ugt_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ugt_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"ugt", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_uge_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uge_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"uge", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_uge_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uge_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"uge", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_uge_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uge_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"uge", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ult_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ult_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ult", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ult_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ult_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ult", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ult_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ult_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v10, v8, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"ult", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ule_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ule_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ule", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ule_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ule_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ule", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_ule_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ule_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v10, v8, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"ule", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_une_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_une_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v9, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"une", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_une_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_une_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v10, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"une", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_une_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_une_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"une", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_uno_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uno_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"uno", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_uno_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uno_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"uno", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @fcmp_uno_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uno_vf_swap_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v10, v10, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v10, v10, v0.t
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, metadata !"uno", <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i1> %v
+}
+
+declare <vscale x 3 x i1> @llvm.vp.fcmp.nxv3bf16(<vscale x 3 x bfloat>, <vscale x 3 x bfloat>, metadata, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i1> @fcmp_oeq_vv_nxv3bf16(<vscale x 3 x bfloat> %va, <vscale x 3 x bfloat> %vb, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vv_nxv3bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 3 x i1> @llvm.vp.fcmp.nxv3bf16(<vscale x 3 x bfloat> %va, <vscale x 3 x bfloat> %vb, metadata !"oeq", <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i1> %v
+}
+
+declare <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, metadata, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"oeq", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"oeq", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"oeq", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ogt_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ogt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ogt_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ogt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ogt_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"ogt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_oge_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oge_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"oge", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_oge_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oge_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"oge", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_oge_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oge_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"oge", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_olt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_olt_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"olt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_olt_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_olt_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"olt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_olt_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_olt_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"olt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ole_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ole_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ole", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ole_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ole_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ole", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ole_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ole_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"ole", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_one_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_one_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"one", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_one_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_one_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"one", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_one_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_one_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"one", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ord_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ord_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v10, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vmand.mm v0, v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ord", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ord_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ord_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v12, v12, v0.t
+; CHECK-NEXT: vmand.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ord", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ord_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ord_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v12, v12, v0.t
+; CHECK-NEXT: vmand.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"ord", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ueq_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t
+; CHECK-NEXT: vmnor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ueq", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ueq_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmflt.vv v9, v16, v12, v0.t
+; CHECK-NEXT: vmnor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ueq", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ueq_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmflt.vv v9, v12, v16, v0.t
+; CHECK-NEXT: vmnor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"ueq", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ugt_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ugt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ugt_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ugt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ugt_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"ugt", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_uge_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uge_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"uge", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_uge_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uge_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"uge", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_uge_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uge_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"uge", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ult_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ult_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ult", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ult_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ult_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ult", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ult_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ult_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"ult", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ule_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ule_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ule", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ule_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ule_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"ule", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_ule_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_ule_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"ule", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_une_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_une_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"une", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_une_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_une_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"une", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_une_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_une_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v16, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"une", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uno_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v10, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"uno", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uno_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v12, v12, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"uno", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_uno_vf_swap_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v12, v12, v0.t
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i1> @llvm.vp.fcmp.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, metadata !"uno", <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+declare <vscale x 64 x i1> @llvm.vp.fcmp.nxv64bf16(<vscale x 64 x bfloat>, <vscale x 64 x bfloat>, metadata, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a3, a1
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a3, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a1, a3, 3
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vl8re16.v v16, (a1)
+; CHECK-NEXT: slli a5, a3, 2
+; CHECK-NEXT: sub a1, a2, a5
+; CHECK-NEXT: sltu a4, a2, a1
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a6, a4, a1
+; CHECK-NEXT: slli a4, a3, 1
+; CHECK-NEXT: sub a1, a6, a4
+; CHECK-NEXT: sltu a7, a6, a1
+; CHECK-NEXT: addi a7, a7, -1
+; CHECK-NEXT: and a7, a7, a1
+; CHECK-NEXT: srli a1, a3, 1
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: add t0, sp, t0
+; CHECK-NEXT: addi t0, t0, 16
+; CHECK-NEXT: vs1r.v v0, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v0, a1
+; CHECK-NEXT: srli a3, a3, 2
+; CHECK-NEXT: addi t0, sp, 16
+; CHECK-NEXT: vs1r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli t0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a3
+; CHECK-NEXT: vl8re16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv t0, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add t0, t0, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, t0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: vmv4r.v v16, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv t0, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, t0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: vsetvli zero, a7, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v26, v16, v8, v0.t
+; CHECK-NEXT: bltu a6, a4, .LBB85_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a6, a4
+; CHECK-NEXT: .LBB85_2:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a7, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, a7
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a6, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v6, v16, v8, v0.t
+; CHECK-NEXT: add a0, a3, a3
+; CHECK-NEXT: bltu a2, a5, .LBB85_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: mv a2, a5
+; CHECK-NEXT: .LBB85_4:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v6, v26, a3
+; CHECK-NEXT: sub a5, a2, a4
+; CHECK-NEXT: sltu a6, a2, a5
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: and a5, a6, a5
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: add a6, sp, a6
+; CHECK-NEXT: addi a6, a6, 16
+; CHECK-NEXT: vl1r.v v8, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT: vmv1r.v v7, v8
+; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a3
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: slli a6, a6, 1
+; CHECK-NEXT: mv a7, a6
+; CHECK-NEXT: slli a6, a6, 3
+; CHECK-NEXT: add a6, a6, a7
+; CHECK-NEXT: add a6, sp, a6
+; CHECK-NEXT: addi a6, a6, 16
+; CHECK-NEXT: vl8r.v v24, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a6, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: slli a6, a6, 1
+; CHECK-NEXT: mv a7, a6
+; CHECK-NEXT: slli a6, a6, 2
+; CHECK-NEXT: add a6, a6, a7
+; CHECK-NEXT: add a6, sp, a6
+; CHECK-NEXT: addi a6, a6, 16
+; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: slli a6, a6, 1
+; CHECK-NEXT: mv a7, a6
+; CHECK-NEXT: slli a6, a6, 2
+; CHECK-NEXT: add a7, a7, a6
+; CHECK-NEXT: slli a6, a6, 1
+; CHECK-NEXT: add a6, a6, a7
+; CHECK-NEXT: add a6, sp, a6
+; CHECK-NEXT: addi a6, a6, 16
+; CHECK-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: slli a6, a6, 1
+; CHECK-NEXT: mv a7, a6
+; CHECK-NEXT: slli a6, a6, 2
+; CHECK-NEXT: add a6, a6, a7
+; CHECK-NEXT: add a6, sp, a6
+; CHECK-NEXT: addi a6, a6, 16
+; CHECK-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v5, v16, v8, v0.t
+; CHECK-NEXT: bltu a2, a4, .LBB85_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: mv a2, a4
+; CHECK-NEXT: .LBB85_6:
+; CHECK-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 2
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v5, a3
+; CHECK-NEXT: add a0, a1, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v6, a1
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i1> %v
+}
declare <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, metadata, <vscale x 1 x i1>, i32)
@@ -2108,10 +3760,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFH-NEXT: and a4, a5, a4
; ZVFH-NEXT: vsetvli zero, a4, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v7, v16, v8, v0.t
-; ZVFH-NEXT: bltu a2, a3, .LBB85_2
+; ZVFH-NEXT: bltu a2, a3, .LBB171_2
; ZVFH-NEXT: # %bb.1:
; ZVFH-NEXT: mv a2, a3
-; ZVFH-NEXT: .LBB85_2:
+; ZVFH-NEXT: .LBB171_2:
; ZVFH-NEXT: vmv1r.v v0, v24
; ZVFH-NEXT: csrr a0, vlenb
; ZVFH-NEXT: slli a0, a0, 3
@@ -2137,14 +3789,18 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a3, 34
-; ZVFHMIN-NEXT: mul a1, a1, a3
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: mv a3, a1
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, a1, a3
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
; ZVFHMIN-NEXT: vmv8r.v v24, v16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a3, 18
-; ZVFHMIN-NEXT: mul a1, a1, a3
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: mv a3, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a1, a1, a3
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2176,8 +3832,12 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li t0, 26
-; ZVFHMIN-NEXT: mul a0, a0, t0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv t0, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add t0, t0, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, t0
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -2190,18 +3850,20 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: vmv4r.v v16, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li t0, 10
-; ZVFHMIN-NEXT: mul a0, a0, t0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv t0, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, t0
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28
; ZVFHMIN-NEXT: vsetvli zero, a7, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v26, v16, v8, v0.t
-; ZVFHMIN-NEXT: bltu a6, a4, .LBB85_2
+; ZVFHMIN-NEXT: bltu a6, a4, .LBB171_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a6, a4
-; ZVFHMIN-NEXT: .LBB85_2:
+; ZVFHMIN-NEXT: .LBB171_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 1
; ZVFHMIN-NEXT: add a0, sp, a0
@@ -2210,8 +3872,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a7, 10
-; ZVFHMIN-NEXT: mul a0, a0, a7
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv a7, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a7
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -2221,10 +3885,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vsetvli zero, a6, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v6, v16, v8, v0.t
; ZVFHMIN-NEXT: add a0, a3, a3
-; ZVFHMIN-NEXT: bltu a2, a5, .LBB85_4
+; ZVFHMIN-NEXT: bltu a2, a5, .LBB171_4
; ZVFHMIN-NEXT: # %bb.3:
; ZVFHMIN-NEXT: mv a2, a5
-; ZVFHMIN-NEXT: .LBB85_4:
+; ZVFHMIN-NEXT: .LBB171_4:
; ZVFHMIN-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v6, v26, a3
; ZVFHMIN-NEXT: sub a5, a2, a4
@@ -2239,43 +3903,57 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
; ZVFHMIN-NEXT: csrr a6, vlenb
-; ZVFHMIN-NEXT: li a7, 18
-; ZVFHMIN-NEXT: mul a6, a6, a7
+; ZVFHMIN-NEXT: slli a6, a6, 1
+; ZVFHMIN-NEXT: mv a7, a6
+; ZVFHMIN-NEXT: slli a6, a6, 3
+; ZVFHMIN-NEXT: add a6, a6, a7
; ZVFHMIN-NEXT: add a6, sp, a6
; ZVFHMIN-NEXT: addi a6, a6, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a6) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli a6, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
; ZVFHMIN-NEXT: csrr a6, vlenb
-; ZVFHMIN-NEXT: li a7, 10
-; ZVFHMIN-NEXT: mul a6, a6, a7
+; ZVFHMIN-NEXT: slli a6, a6, 1
+; ZVFHMIN-NEXT: mv a7, a6
+; ZVFHMIN-NEXT: slli a6, a6, 2
+; ZVFHMIN-NEXT: add a6, a6, a7
; ZVFHMIN-NEXT: add a6, sp, a6
; ZVFHMIN-NEXT: addi a6, a6, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a6, vlenb
-; ZVFHMIN-NEXT: li a7, 26
-; ZVFHMIN-NEXT: mul a6, a6, a7
+; ZVFHMIN-NEXT: slli a6, a6, 1
+; ZVFHMIN-NEXT: mv a7, a6
+; ZVFHMIN-NEXT: slli a6, a6, 2
+; ZVFHMIN-NEXT: add a7, a7, a6
+; ZVFHMIN-NEXT: slli a6, a6, 1
+; ZVFHMIN-NEXT: add a6, a6, a7
; ZVFHMIN-NEXT: add a6, sp, a6
; ZVFHMIN-NEXT: addi a6, a6, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: csrr a6, vlenb
-; ZVFHMIN-NEXT: li a7, 10
-; ZVFHMIN-NEXT: mul a6, a6, a7
+; ZVFHMIN-NEXT: slli a6, a6, 1
+; ZVFHMIN-NEXT: mv a7, a6
+; ZVFHMIN-NEXT: slli a6, a6, 2
+; ZVFHMIN-NEXT: add a6, a6, a7
; ZVFHMIN-NEXT: add a6, sp, a6
; ZVFHMIN-NEXT: addi a6, a6, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a6) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v5, v16, v8, v0.t
-; ZVFHMIN-NEXT: bltu a2, a4, .LBB85_6
+; ZVFHMIN-NEXT: bltu a2, a4, .LBB171_6
; ZVFHMIN-NEXT: # %bb.5:
; ZVFHMIN-NEXT: mv a2, a4
-; ZVFHMIN-NEXT: .LBB85_6:
+; ZVFHMIN-NEXT: .LBB171_6:
; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 26
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 2
+; ZVFHMIN-NEXT: add a5, a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -2290,8 +3968,10 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vslideup.vx v8, v6, a1
; ZVFHMIN-NEXT: vmv.v.v v0, v8
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 34
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -3377,163 +5057,6 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f64(<vscale x 8 x double> %va, do
declare <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, metadata, <vscale x 32 x i1>, i32)
define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: fcmp_oeq_vv_nxv32f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 48
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; CHECK-NEXT: vmv1r.v v24, v0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 40
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: slli t0, a4, 3
-; CHECK-NEXT: slli a1, a4, 5
-; CHECK-NEXT: sub t1, a1, t0
-; CHECK-NEXT: srli a1, a4, 2
-; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v7, v0, a1
-; CHECK-NEXT: srli a3, a4, 3
-; CHECK-NEXT: add a5, a2, t0
-; CHECK-NEXT: vl8re64.v v8, (a5)
-; CHECK-NEXT: slli t3, a4, 4
-; CHECK-NEXT: slli a5, a4, 1
-; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: mv a7, a6
-; CHECK-NEXT: bltu a6, a5, .LBB171_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a7, a5
-; CHECK-NEXT: .LBB171_2:
-; CHECK-NEXT: add t2, a0, t0
-; CHECK-NEXT: add t1, a2, t1
-; CHECK-NEXT: add t0, a2, t3
-; CHECK-NEXT: vl8re64.v v16, (a2)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: sub a2, a7, a4
-; CHECK-NEXT: sltu t3, a7, a2
-; CHECK-NEXT: addi t3, t3, -1
-; CHECK-NEXT: and a2, t3, a2
-; CHECK-NEXT: csrr t3, vlenb
-; CHECK-NEXT: slli t3, t3, 5
-; CHECK-NEXT: add t3, sp, t3
-; CHECK-NEXT: addi t3, t3, 16
-; CHECK-NEXT: vl8r.v v16, (t3) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v6, v16, v8, v0.t
-; CHECK-NEXT: bltu a7, a4, .LBB171_4
-; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv a7, a4
-; CHECK-NEXT: .LBB171_4:
-; CHECK-NEXT: vl8re64.v v8, (t2)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v8, (t1)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li t1, 24
-; CHECK-NEXT: mul a2, a2, t1
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v18, v7, a3
-; CHECK-NEXT: vl8re64.v v8, (t0)
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT: vl8re64.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 40
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v17, v24, v8, v0.t
-; CHECK-NEXT: add a2, a3, a3
-; CHECK-NEXT: sub a0, a6, a5
-; CHECK-NEXT: sltu a5, a6, a0
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a0, a5, a0
-; CHECK-NEXT: vsetvli zero, a2, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vx v17, v6, a3
-; CHECK-NEXT: mv a2, a0
-; CHECK-NEXT: bltu a0, a4, .LBB171_6
-; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a2, a4
-; CHECK-NEXT: .LBB171_6:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 3
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: addi a5, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v16, v24, v8, v0.t
-; CHECK-NEXT: sub a2, a0, a4
-; CHECK-NEXT: sltu a0, a0, a2
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a2
-; CHECK-NEXT: vmv1r.v v0, v18
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a4, 24
-; CHECK-NEXT: mul a2, a2, a4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v18, v24, v8, v0.t
-; CHECK-NEXT: add a0, a1, a3
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vx v17, v16, a1
-; CHECK-NEXT: slli a0, a3, 1
-; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v17, v18, a0
-; CHECK-NEXT: vmv1r.v v0, v17
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 48
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
%v = call <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, metadata !"oeq", <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x i1> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
index c2c977bec60555..23d73481aed2d7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
@@ -1,16 +1,1162 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64
; FIXME: The scalar/vector operations ('fv' tests) should swap operands and
; condition codes accordingly in order to generate a 'vf' instruction.
+define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_oeq_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp oeq <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_oeq_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp oeq <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_oeq_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp oeq <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_oeq_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp oeq <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_oeq_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp oeq <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ogt_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ogt <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ogt_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ogt <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ogt_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ogt <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_ogt_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ogt <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_ogt_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ogt <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oge_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_oge_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp oge <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oge_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_oge_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp oge <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oge_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_oge_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp oge <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oge_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_oge_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp oge <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_oge_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_oge_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp oge <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_olt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_olt_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp olt <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_olt_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_olt_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp olt <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_olt_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_olt_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp olt <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_olt_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_olt_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp olt <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_olt_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_olt_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp olt <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ole_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ole_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ole <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ole_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ole_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ole <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ole_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ole_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ole <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ole_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_ole_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ole <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ole_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_ole_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ole <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_one_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_one_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmflt.vv v9, v12, v16
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %vc = fcmp one <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_one_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_one_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16
+; CHECK-NEXT: vmflt.vv v9, v16, v12
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp one <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_one_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_one_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmflt.vv v9, v12, v16
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp one <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_one_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_one_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp one <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_one_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_one_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp one <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ord_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ord_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v10, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vmand.mm v0, v8, v10
+; CHECK-NEXT: ret
+ %vc = fcmp ord <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ord_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ord_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v12, v12
+; CHECK-NEXT: vmand.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ord <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ord_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ord_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v12, v12
+; CHECK-NEXT: vmand.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ord <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ord_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_ord_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v10, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vmand.mm v0, v8, v10
+; CHECK-NEXT: ret
+ %vc = fcmp ord <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ord_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_ord_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v8, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v9, v12, v12
+; CHECK-NEXT: vmand.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ord <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ueq_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmflt.vv v9, v12, v16
+; CHECK-NEXT: vmnor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %vc = fcmp ueq <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ueq_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16
+; CHECK-NEXT: vmflt.vv v9, v16, v12
+; CHECK-NEXT: vmnor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ueq <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ueq_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmflt.vv v9, v12, v16
+; CHECK-NEXT: vmnor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ueq <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_ueq_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ueq <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_ueq_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ueq <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ugt_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %vc = fcmp ugt <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ugt_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v12, v16
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ugt <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ugt_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ugt <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_ugt_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ugt <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_ugt_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ugt <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uge_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_uge_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %vc = fcmp uge <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uge_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_uge_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp uge <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uge_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_uge_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp uge <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uge_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_uge_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp uge <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uge_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_uge_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp uge <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ult_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ult_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %vc = fcmp ult <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ult_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ult_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ult <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ult_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ult_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v8, v12, v16
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ult <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ult_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_ult_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ult <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ult_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_ult_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ult <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ule_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_ule_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %vc = fcmp ule <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ule_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ule_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v16, v12
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ule <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ule_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_ule_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v8, v12, v16
+; CHECK-NEXT: vmnot.m v0, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ule <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ule_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_ule_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp ule <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_ule_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_ule_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfle.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp ule <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_une_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_une_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp une <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_une_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_une_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp une <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_une_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_une_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp une <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_une_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_une_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v16, v12
+; CHECK-NEXT: ret
+ %vc = fcmp une <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_une_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_une_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp une <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: fcmp_uno_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v10, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12
+; CHECK-NEXT: vmor.mm v0, v8, v10
+; CHECK-NEXT: ret
+ %vc = fcmp uno <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_uno_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v12, v12
+; CHECK-NEXT: vmor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp uno <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: fcmp_uno_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v12, v12
+; CHECK-NEXT: vmor.mm v0, v9, v8
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp uno <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vv_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) #0 {
+; CHECK-LABEL: fcmp_uno_vv_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v10, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12
+; CHECK-NEXT: vmor.mm v0, v8, v10
+; CHECK-NEXT: ret
+ %vc = fcmp uno <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x i1> %vc
+}
+
+define <vscale x 8 x i1> @fcmp_uno_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va, bfloat %b) #0 {
+; CHECK-LABEL: fcmp_uno_vf_nxv8bf16_nonans:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v8, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfne.vv v9, v12, v12
+; CHECK-NEXT: vmor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fcmp uno <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x i1> %vc
+}
+
define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb) {
; ZVFH-LABEL: fcmp_oeq_vv_nxv8f16:
; ZVFH: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
index af80e627b43fa1..53be153f8ff2da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
@@ -1,12 +1,239 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfadd_vv_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfadd_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
declare <vscale x 1 x half> @llvm.experimental.constrained.fadd.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, metadata, metadata)
define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) strictfp {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
index 8f21e326e68790..c3c0958f7096d9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
@@ -1,12 +1,252 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) {
+; CHECK-LABEL: vfadd_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fadd <vscale x 1 x bfloat> %va, %vb
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfadd_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = fadd <vscale x 1 x bfloat> %va, %splat
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) {
+; CHECK-LABEL: vfadd_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fadd <vscale x 2 x bfloat> %va, %vb
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfadd_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = fadd <vscale x 2 x bfloat> %va, %splat
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) {
+; CHECK-LABEL: vfadd_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %vc = fadd <vscale x 4 x bfloat> %va, %vb
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfadd_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = fadd <vscale x 4 x bfloat> %va, %splat
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: vfadd_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %vc = fadd <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfadd_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fadd <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfadd_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfadd_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fadd <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) {
+; CHECK-LABEL: vfadd_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %vc = fadd <vscale x 16 x bfloat> %va, %vb
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfadd_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = fadd <vscale x 16 x bfloat> %va, %splat
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
+; CHECK-LABEL: vfadd_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %vc = fadd <vscale x 32 x bfloat> %va, %vb
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfadd_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = fadd <vscale x 32 x bfloat> %va, %splat
+ ret <vscale x 32 x bfloat> %vc
+}
define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) {
; ZVFH-LABEL: vfadd_vv_nxv1f16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 395f1a7c382bff..b3de904d20622b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -1,13 +1,660 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfadd_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv1bf16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfadd_vf_nxv1bf16_unmasked_commute(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv1bf16_unmasked_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v8, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfadd_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfadd_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfadd.vv v9, v10, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfadd_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v12, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v10, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfadd_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v10, v10, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfadd_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v16, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v12, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfadd_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v12, v12, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfadd_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB22_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB23_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB23_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v8
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 3
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB24_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB24_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfadd_vf_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vmv8r.v v16, v8
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB25_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB25_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfadd.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -564,10 +1211,10 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB48_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB22_2:
+; ZVFHMIN-NEXT: .LBB48_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -621,10 +1268,10 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB49_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB23_2:
+; ZVFHMIN-NEXT: .LBB49_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -699,10 +1346,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: vfadd.vv v16, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB24_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB50_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB24_2:
+; ZVFHMIN-NEXT: .LBB50_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -780,10 +1427,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB25_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB51_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB25_2:
+; ZVFHMIN-NEXT: .LBB51_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll
index f2af8ac3b02d4c..c97278480f1a81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll
@@ -1,18 +1,51 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
-define <vscale x 2 x i1> @isnan_nxv2f16(<vscale x 2 x half> %x) {
-; CHECK-LABEL: isnan_nxv2f16:
+define <vscale x 2 x i1> @isnan_nxv2bf16(<vscale x 2 x bfloat> %x) {
+; CHECK-LABEL: isnan_nxv2bf16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfclass.v v8, v8
-; CHECK-NEXT: li a0, 768
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: addi a1, a0, -1
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: vmsgt.vx v0, v8, a0
; CHECK-NEXT: ret
+ %1 = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2bf16(<vscale x 2 x bfloat> %x, i32 3) ; nan
+ ret <vscale x 2 x i1> %1
+}
+
+define <vscale x 2 x i1> @isnan_nxv2f16(<vscale x 2 x half> %x) {
+; ZVFH-LABEL: isnan_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vfclass.v v8, v8
+; ZVFH-NEXT: li a0, 768
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vmsne.vi v0, v8, 0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: isnan_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: li a0, 31
+; ZVFHMIN-NEXT: slli a0, a0, 10
+; ZVFHMIN-NEXT: vmsgt.vx v0, v8, a0
+; ZVFHMIN-NEXT: ret
%1 = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f16(<vscale x 2 x half> %x, i32 3) ; nan
ret <vscale x 2 x i1> %1
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
index 69095a0b21bb09..aa59732e1e1e52 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
@@ -1,12 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfdiv_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfdiv_vv_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fdiv.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfdiv_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfdiv_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fdiv.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfdiv_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfdiv_vv_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fdiv.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfdiv_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfdiv_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fdiv.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfdiv_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfdiv_vv_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fdiv.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfdiv_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfdiv_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fdiv.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfdiv_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfdiv_vv_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fdiv.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfdiv_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfdiv_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fdiv.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfdiv_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfdiv_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fdiv.nxv8bf16(<vscale x 8 x bfloat> %splat, <vscale x 8 x bfloat> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfdiv_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfdiv_vv_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fdiv.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfdiv_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fdiv.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfdiv_vv_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfdiv_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
declare <vscale x 1 x half> @llvm.experimental.constrained.fdiv.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, metadata, metadata)
define <vscale x 1 x half> @vfdiv_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) strictfp {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
index 9f5434dd34727d..f7db2be35d720b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
@@ -1,13 +1,249 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+define <vscale x 1 x bfloat> @vfdiv_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) {
+; CHECK-LABEL: vfdiv_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fdiv <vscale x 1 x bfloat> %va, %vb
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfdiv_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfdiv_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = fdiv <vscale x 1 x bfloat> %va, %splat
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfdiv_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) {
+; CHECK-LABEL: vfdiv_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fdiv <vscale x 2 x bfloat> %va, %vb
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfdiv_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfdiv_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = fdiv <vscale x 2 x bfloat> %va, %splat
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfdiv_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) {
+; CHECK-LABEL: vfdiv_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %vc = fdiv <vscale x 4 x bfloat> %va, %vb
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfdiv_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfdiv_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = fdiv <vscale x 4 x bfloat> %va, %splat
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfdiv_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: vfdiv_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %vc = fdiv <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfdiv_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfdiv_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fdiv <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfdiv_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfdiv_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fdiv <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfdiv_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) {
+; CHECK-LABEL: vfdiv_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %vc = fdiv <vscale x 16 x bfloat> %va, %vb
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfdiv_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = fdiv <vscale x 16 x bfloat> %va, %splat
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
+; CHECK-LABEL: vfdiv_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %vc = fdiv <vscale x 32 x bfloat> %va, %vb
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfdiv_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = fdiv <vscale x 32 x bfloat> %va, %splat
+ ret <vscale x 32 x bfloat> %vc
+}
+
define <vscale x 1 x half> @vfdiv_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) {
; ZVFH-LABEL: vfdiv_vv_nxv1f16:
; ZVFH: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index 52e2a9535ef603..aa39fe5b5ec851 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -1,13 +1,622 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfdiv_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfdiv_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfdiv_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfdiv_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfdiv_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfdiv_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfdiv_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfdiv_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfdiv.vv v9, v10, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfdiv_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfdiv_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v12, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfdiv_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v10, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfdiv_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfdiv.vv v10, v10, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+define <vscale x 8 x bfloat> @vfdiv_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfdiv_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v16, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfdiv_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v12, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfdiv_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfdiv.vv v12, v12, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfdiv_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfdiv_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB20_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB20_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB21_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB21_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v8
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 3
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB22_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfdiv_vf_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vmv8r.v v16, v8
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB23_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB23_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfdiv_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -514,10 +1123,10 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB20_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB20_2:
+; ZVFHMIN-NEXT: .LBB44_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -571,10 +1180,10 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB21_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB21_2:
+; ZVFHMIN-NEXT: .LBB45_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -649,10 +1258,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: vfdiv.vv v16, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB22_2:
+; ZVFHMIN-NEXT: .LBB46_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -730,10 +1339,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB23_2:
+; ZVFHMIN-NEXT: .LBB47_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index b6bb0371121b4f..baecb7bb7d2483 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -1,15 +1,1429 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
; RUN: --check-prefixes=CHECK,ZVFHMIN
+declare <vscale x 1 x bfloat> @llvm.vp.fma.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfma_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x bfloat> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v10, v11, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fma.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x bfloat> %c, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfma_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x bfloat> %c, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v10, v11
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fma.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x bfloat> %c, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x bfloat> %vc, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fma.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x bfloat> %vc, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv1bf16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v9, v8, v11, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fma.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vc, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fma.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16_unmasked_commute(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv1bf16_unmasked_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fma.nxv1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vc, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.fma.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfma_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v10, v11, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fma.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfma_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v10, v11
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fma.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x bfloat> %vc, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fma.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %vc, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16_commute(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x bfloat> %vc, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv2bf16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v9, v8, v11, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fma.nxv2bf16(<vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vc, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fma.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %vc, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16_unmasked_commute(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv2bf16_unmasked_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fma.nxv2bf16(<vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vc, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.fma.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfma_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v14, v10, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fma.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfma_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v14, v10, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fma.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x bfloat> %vc, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v14, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fma.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %vc, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16_commute(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x bfloat> %vc, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv4bf16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v14, v8, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fma.nxv4bf16(<vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vc, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v14, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fma.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %vc, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16_unmasked_commute(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv4bf16_unmasked_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v14, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fma.nxv4bf16(<vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vc, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.fma.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vfma_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v20, v12, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fma.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfma_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v20, v12, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fma.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x bfloat> %vc, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v24, v20, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fma.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %vc, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16_commute(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x bfloat> %vc, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv8bf16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v20, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fma.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vc, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v24, v20, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fma.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %vc, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16_unmasked_commute(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv8bf16_unmasked_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v24, v20, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fma.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vc, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfma_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x bfloat> %c, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x bfloat> %c, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfma_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x bfloat> %c, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v0, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x bfloat> %c, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x bfloat> %vc, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v4, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x bfloat> %vc, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_commute(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x bfloat> %vc, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv16bf16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v4, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat> %vb, <vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vc, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a1
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v0, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x bfloat> %vc, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_unmasked_commute(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a1
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v0, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat> %vb, <vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vc, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x bfloat> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a3, a3, a2
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: vl8re16.v v0, (a0)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a0, a2, 1
+; CHECK-NEXT: sub a3, a1, a0
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v24, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs1r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a4, a4, a2
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: vmv4r.v v24, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 5
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 4
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a1, a0, .LBB30_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: .LBB30_2:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a2, a2, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a2, a0, 5
+; CHECK-NEXT: add a0, a2, a0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a2, a0, 4
+; CHECK-NEXT: add a0, a2, a0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a2, a0, 5
+; CHECK-NEXT: add a0, a2, a0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a2, a0, 5
+; CHECK-NEXT: add a0, a2, a0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vmv.v.v v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x bfloat> %c, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x bfloat> %c, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 5
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v16
+; CHECK-NEXT: vl8re16.v v16, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a0, a2, 1
+; CHECK-NEXT: sub a3, a1, a0
+; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v7
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v7, a2
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a1, a0, .LBB31_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: .LBB31_2:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vfmacc.vv v0, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x bfloat> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v24, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a4, a4, a2
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 4
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 5
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vmv.v.v v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv32bf16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v24, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a4, a4, a2
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 4
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 5
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 4
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; CHECK-NEXT: vmv.v.v v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a1, a1, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v24, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmset.m v7
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v7, a2
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8
+; CHECK-NEXT: bltu a0, a1, .LBB34_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB34_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v0, v24, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0
+; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, i32 zeroext %evl) {
+; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v24, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmset.m v7
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v7, a2
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v8, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB35_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v0, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
declare <vscale x 1 x half> @llvm.vp.fma.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfma_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x half> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -833,8 +2247,12 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 42
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 2
+; ZVFHMIN-NEXT: add a3, a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 2
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb
; ZVFHMIN-NEXT: vmv1r.v v24, v0
@@ -856,8 +2274,11 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs1r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 25
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a4, a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -897,13 +2318,16 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB30_2
+; ZVFHMIN-NEXT: bltu a1, a0, .LBB66_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB30_2:
+; ZVFHMIN-NEXT: .LBB66_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a2, a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -957,8 +2381,12 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 42
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a1, a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -985,8 +2413,10 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vmv8r.v v24, v16
; ZVFHMIN-NEXT: vl8re16.v v16, (a0)
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -1017,8 +2447,10 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1029,10 +2461,10 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB31_2
+; ZVFHMIN-NEXT: bltu a1, a0, .LBB67_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB31_2:
+; ZVFHMIN-NEXT: .LBB67_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
@@ -1048,8 +2480,10 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1081,8 +2515,12 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 42
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 2
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 2
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb
; ZVFHMIN-NEXT: fmv.x.h a1, fa0
@@ -1109,8 +2547,11 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 25
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a4, a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
@@ -1146,13 +2587,16 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB32_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB68_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB32_2:
+; ZVFHMIN-NEXT: .LBB68_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -1170,8 +2614,11 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -1193,8 +2640,11 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1206,8 +2656,12 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 42
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a1, a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -1229,8 +2683,12 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 42
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 2
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 2
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb
; ZVFHMIN-NEXT: fmv.x.h a1, fa0
@@ -1257,8 +2715,11 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 25
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a4, a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
@@ -1294,13 +2755,16 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB33_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB69_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB33_2:
+; ZVFHMIN-NEXT: .LBB69_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1325,8 +2789,11 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -1341,8 +2808,11 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -1354,8 +2824,12 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 42
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a1, a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -1384,8 +2858,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -1415,8 +2891,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1427,10 +2905,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB34_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB70_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB34_2:
+; ZVFHMIN-NEXT: .LBB70_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -1446,8 +2924,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -1489,8 +2969,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -1520,8 +3002,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1532,10 +3016,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB35_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB71_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB35_2:
+; ZVFHMIN-NEXT: .LBB71_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -1551,8 +3035,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -2250,14 +3736,18 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 40
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a3, a1
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a3, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -2294,8 +3784,10 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: and a0, a7, a6
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a2, a2, a3
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a2, a2, a3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -2307,15 +3799,17 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a2, 24
-; CHECK-NEXT: mul a0, a0, a2
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: bltu a4, a1, .LBB92_2
+; CHECK-NEXT: bltu a4, a1, .LBB128_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a1
-; CHECK-NEXT: .LBB92_2:
+; CHECK-NEXT: .LBB128_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 5
@@ -2333,14 +3827,18 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -2354,8 +3852,10 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a3, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a1, vlenb
@@ -2389,10 +3889,10 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v16, v8, v24
-; CHECK-NEXT: bltu a4, a1, .LBB93_2
+; CHECK-NEXT: bltu a4, a1, .LBB129_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a1
-; CHECK-NEXT: .LBB93_2:
+; CHECK-NEXT: .LBB129_2:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
@@ -2404,8 +3904,10 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: vfmadd.vv v0, v24, v8
; CHECK-NEXT: vmv.v.v v8, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -7161,8 +8663,12 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 42
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 2
+; ZVFHMIN-NEXT: add a3, a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 2
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb
; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
@@ -7183,8 +8689,11 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a2, a0, 1
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: li a4, 25
-; ZVFHMIN-NEXT: mul a3, a3, a4
+; ZVFHMIN-NEXT: mv a4, a3
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a4, a4, a3
+; ZVFHMIN-NEXT: slli a3, a3, 1
+; ZVFHMIN-NEXT: add a3, a3, a4
; ZVFHMIN-NEXT: add a3, sp, a3
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -7197,10 +8706,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: bltu a1, a2, .LBB244_2
+; ZVFHMIN-NEXT: bltu a1, a2, .LBB280_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB244_2:
+; ZVFHMIN-NEXT: .LBB280_2:
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: csrr a4, vlenb
; ZVFHMIN-NEXT: slli a4, a4, 3
@@ -7230,8 +8739,11 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: addi a3, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: li a4, 25
-; ZVFHMIN-NEXT: mul a3, a3, a4
+; ZVFHMIN-NEXT: mv a4, a3
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a4, a4, a3
+; ZVFHMIN-NEXT: slli a3, a3, 1
+; ZVFHMIN-NEXT: add a3, a3, a4
; ZVFHMIN-NEXT: add a3, sp, a3
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
@@ -7258,8 +8770,11 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a2, a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -7271,8 +8786,11 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a2, a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -7284,8 +8802,12 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 42
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a1, a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -7307,8 +8829,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 40
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 2
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
@@ -7316,8 +8840,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFHMIN-NEXT: vxor.vx v0, v24, a0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
@@ -7369,8 +8895,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 24
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
@@ -7380,10 +8908,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB245_2
+; ZVFHMIN-NEXT: bltu a1, a0, .LBB281_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB245_2:
+; ZVFHMIN-NEXT: .LBB281_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add a0, sp, a0
@@ -7404,8 +8932,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v24
; ZVFHMIN-NEXT: vmv8r.v v8, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 40
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -7433,8 +8963,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -7456,15 +8988,17 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: mv a3, a0
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB246_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB282_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB246_2:
+; ZVFHMIN-NEXT: .LBB282_2:
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: vmv4r.v v4, v12
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -7505,8 +9039,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -7559,8 +9095,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -7582,15 +9120,17 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: mv a3, a0
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB247_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB283_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB247_2:
+; ZVFHMIN-NEXT: .LBB283_2:
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: vmv4r.v v4, v12
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
@@ -7626,8 +9166,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -7685,8 +9227,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -7720,8 +9264,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -7743,10 +9289,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB248_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB284_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB248_2:
+; ZVFHMIN-NEXT: .LBB284_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -7754,8 +9300,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -7803,8 +9351,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v8, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -7838,8 +9388,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -7861,10 +9413,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB249_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB285_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB249_2:
+; ZVFHMIN-NEXT: .LBB285_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -7872,8 +9424,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -7919,8 +9473,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -7945,13 +9501,15 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a2, .LBB250_2
+; ZVFHMIN-NEXT: bltu a1, a2, .LBB286_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB250_2:
+; ZVFHMIN-NEXT: .LBB286_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -7986,8 +9544,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -8029,8 +9589,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 40
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 2
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
@@ -8061,15 +9623,17 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a2, .LBB251_2
+; ZVFHMIN-NEXT: bltu a1, a2, .LBB287_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB251_2:
+; ZVFHMIN-NEXT: .LBB287_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
; ZVFHMIN-NEXT: slli a4, a4, 5
; ZVFHMIN-NEXT: add a4, sp, a4
@@ -8077,8 +9641,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
@@ -8094,8 +9660,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a3, a3, a4
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: mv a4, a3
+; ZVFHMIN-NEXT: slli a3, a3, 1
+; ZVFHMIN-NEXT: add a3, a3, a4
; ZVFHMIN-NEXT: add a3, sp, a3
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
@@ -8125,8 +9693,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -8143,8 +9713,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 40
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -8171,8 +9743,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 24
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -8209,8 +9783,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -8238,13 +9814,15 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB252_2
+; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB252_2:
+; ZVFHMIN-NEXT: .LBB288_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -8287,8 +9865,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 24
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -8325,8 +9905,10 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
@@ -8354,13 +9936,15 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB253_2
+; ZVFHMIN-NEXT: bltu a1, a0, .LBB289_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB253_2:
+; ZVFHMIN-NEXT: .LBB289_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -8398,8 +9982,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 40
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 2
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; ZVFHMIN-NEXT: fmv.x.h a1, fa0
@@ -8422,8 +10008,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
@@ -8434,10 +10022,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB254_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB290_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB254_2:
+; ZVFHMIN-NEXT: .LBB290_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
; ZVFHMIN-NEXT: slli a4, a4, 5
; ZVFHMIN-NEXT: add a4, sp, a4
@@ -8445,8 +10033,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -8467,8 +10057,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a3, a3, a4
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: mv a4, a3
+; ZVFHMIN-NEXT: slli a3, a3, 1
+; ZVFHMIN-NEXT: add a3, a3, a4
; ZVFHMIN-NEXT: add a3, sp, a3
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -8493,8 +10085,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -8514,8 +10108,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 40
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -8546,8 +10142,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -8572,13 +10170,15 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv4r.v v4, v12
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB255_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB291_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB255_2:
+; ZVFHMIN-NEXT: .LBB291_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
@@ -8613,8 +10213,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -8674,8 +10276,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -8711,8 +10315,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -8740,13 +10346,15 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB256_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB256_2:
+; ZVFHMIN-NEXT: .LBB292_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -8793,8 +10401,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -8830,8 +10440,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload
@@ -8860,13 +10472,15 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB257_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB293_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB257_2:
+; ZVFHMIN-NEXT: .LBB293_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -8910,8 +10524,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -8939,13 +10555,15 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vmv4r.v v4, v12
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB258_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB294_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB258_2:
+; ZVFHMIN-NEXT: .LBB294_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
@@ -8984,8 +10602,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -9032,8 +10652,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -9063,13 +10685,15 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB259_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB295_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB259_2:
+; ZVFHMIN-NEXT: .LBB295_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -9109,8 +10733,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -9160,8 +10786,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -9200,8 +10828,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload
@@ -9230,13 +10860,15 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB260_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB296_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB260_2:
+; ZVFHMIN-NEXT: .LBB296_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -9280,8 +10912,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -9320,8 +10954,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
@@ -9349,13 +10985,15 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB261_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB297_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB261_2:
+; ZVFHMIN-NEXT: .LBB297_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -9402,8 +11040,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -9428,13 +11068,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a2, .LBB262_2
+; ZVFHMIN-NEXT: bltu a1, a2, .LBB298_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB262_2:
+; ZVFHMIN-NEXT: .LBB298_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -9469,8 +11111,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a0
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -9512,8 +11156,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 40
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 2
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
@@ -9544,15 +11190,17 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a2, .LBB263_2
+; ZVFHMIN-NEXT: bltu a1, a2, .LBB299_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB263_2:
+; ZVFHMIN-NEXT: .LBB299_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
; ZVFHMIN-NEXT: slli a4, a4, 5
; ZVFHMIN-NEXT: add a4, sp, a4
@@ -9560,8 +11208,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
@@ -9577,8 +11227,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v4
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a3, a3, a4
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: mv a4, a3
+; ZVFHMIN-NEXT: slli a3, a3, 1
+; ZVFHMIN-NEXT: add a3, a3, a4
; ZVFHMIN-NEXT: add a3, sp, a3
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
@@ -9608,8 +11260,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -9626,8 +11280,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 40
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -9654,8 +11310,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 24
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -9692,8 +11350,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -9721,13 +11381,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB264_2
+; ZVFHMIN-NEXT: bltu a1, a0, .LBB300_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB264_2:
+; ZVFHMIN-NEXT: .LBB300_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -9770,8 +11432,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: sub sp, sp, a2
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a3, 24
-; ZVFHMIN-NEXT: mul a2, a2, a3
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a3, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -9808,8 +11472,10 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
@@ -9837,13 +11503,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB265_2
+; ZVFHMIN-NEXT: bltu a1, a0, .LBB301_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB265_2:
+; ZVFHMIN-NEXT: .LBB301_2:
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a0, a0, a2
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a2, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a2
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -9881,8 +11549,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 40
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 2
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; ZVFHMIN-NEXT: fmv.x.h a1, fa0
@@ -9907,14 +11577,16 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: mv a3, a0
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB266_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB302_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB266_2:
+; ZVFHMIN-NEXT: .LBB302_2:
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
@@ -9948,8 +11620,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -9984,8 +11658,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 40
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -10020,8 +11696,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -10034,10 +11712,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: mv a3, a0
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB267_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB303_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB267_2:
+; ZVFHMIN-NEXT: .LBB303_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
; ZVFHMIN-NEXT: slli a4, a4, 4
; ZVFHMIN-NEXT: add a4, sp, a4
@@ -10050,8 +11728,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -10087,8 +11767,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -10136,8 +11818,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -10172,8 +11856,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
@@ -10195,10 +11881,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB268_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB304_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB268_2:
+; ZVFHMIN-NEXT: .LBB304_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -10206,8 +11892,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -10253,8 +11941,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -10289,8 +11979,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
@@ -10307,10 +11999,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB269_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB305_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB269_2:
+; ZVFHMIN-NEXT: .LBB305_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -10323,8 +12015,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -10366,8 +12060,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -10393,13 +12089,15 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: mv a3, a0
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB270_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB306_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB270_2:
+; ZVFHMIN-NEXT: .LBB306_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 24
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -10434,8 +12132,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -10483,13 +12183,18 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 34
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -10519,13 +12224,16 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: addi a3, a3, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: mv a3, a0
-; ZVFHMIN-NEXT: bltu a0, a2, .LBB271_2
+; ZVFHMIN-NEXT: bltu a0, a2, .LBB307_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: .LBB271_2:
+; ZVFHMIN-NEXT: .LBB307_2:
; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: li a5, 25
-; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: add a5, a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
; ZVFHMIN-NEXT: add a4, sp, a4
; ZVFHMIN-NEXT: addi a4, a4, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
@@ -10576,8 +12284,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 25
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a2, a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -10600,8 +12311,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: li a1, 34
-; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: ret
@@ -10629,8 +12342,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: vmv8r.v v24, v16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -10668,8 +12383,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -10686,10 +12403,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB272_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB308_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB272_2:
+; ZVFHMIN-NEXT: .LBB308_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -10702,8 +12419,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -10745,8 +12464,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -10784,8 +12505,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: li a4, 24
-; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: mv a4, a2
+; ZVFHMIN-NEXT: slli a2, a2, 1
+; ZVFHMIN-NEXT: add a2, a2, a4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
@@ -10807,10 +12530,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB273_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB309_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB273_2:
+; ZVFHMIN-NEXT: .LBB309_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
@@ -10818,8 +12541,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: li a2, 24
-; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
index 52e438013fdbac..dea411348ce542 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
@@ -1,16 +1,402 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 \
+; RUN: -mattr=+m,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 \
+; RUN: -mattr=+m,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
; This tests a mix of vfmacc and vfmadd by using different operand orders to
; trigger commuting in TwoAddressInstructionPass.
+define <vscale x 1 x bfloat> @vfmadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc) strictfp {
+; CHECK-LABEL: vfmadd_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v9, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vd = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fma.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 1 x bfloat> %vd
+}
+
+define <vscale x 1 x bfloat> @vfmadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, bfloat %c) strictfp {
+; CHECK-LABEL: vfmadd_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vd = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fma.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %splat, <vscale x 1 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 1 x bfloat> %vd
+}
+
+
+define <vscale x 2 x bfloat> @vfmadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %vc) strictfp {
+; CHECK-LABEL: vfmadd_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v10, v9, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %vd = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fma.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vc, <vscale x 2 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 2 x bfloat> %vd
+}
+
+define <vscale x 2 x bfloat> @vfmadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, bfloat %c) strictfp {
+; CHECK-LABEL: vfmadd_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v9, v8, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vd = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fma.nxv2bf16(<vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %splat, <vscale x 2 x bfloat> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 2 x bfloat> %vd
+}
+
+
+define <vscale x 4 x bfloat> @vfmadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %vc) strictfp {
+; CHECK-LABEL: vfmadd_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v14, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT: ret
+ %vd = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fma.nxv4bf16(<vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vc, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 4 x bfloat> %vd
+}
+
+define <vscale x 4 x bfloat> @vfmadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, bfloat %c) strictfp {
+; CHECK-LABEL: vfmadd_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v14, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vd = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fma.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %splat, <vscale x 4 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 4 x bfloat> %vd
+}
+
+
+define <vscale x 8 x bfloat> @vfmadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %vc) strictfp {
+; CHECK-LABEL: vfmadd_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v20, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %vd = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fma.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %vc, <vscale x 8 x bfloat> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 8 x bfloat> %vd
+}
+
+define <vscale x 8 x bfloat> @vfmadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, bfloat %c) strictfp {
+; CHECK-LABEL: vfmadd_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v24, v20, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vd = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fma.nxv8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %splat, <vscale x 8 x bfloat> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 8 x bfloat> %vd
+}
+
+
+define <vscale x 16 x bfloat> @vfmadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x bfloat> %vc) strictfp {
+; CHECK-LABEL: vfmadd_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v24, v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %vd = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fma.nxv16bf16(<vscale x 16 x bfloat> %vc, <vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 16 x bfloat> %vd
+}
+
+define <vscale x 16 x bfloat> @vfmadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, bfloat %c) strictfp {
+; CHECK-LABEL: vfmadd_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vd = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fma.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %splat, <vscale x 16 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 16 x bfloat> %vd
+}
+
+define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %vc) strictfp {
+; CHECK-LABEL: vfmadd_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 5
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT: vl8re16.v v0, (a0)
+; CHECK-NEXT: vmv8r.v v24, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv8r.v v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vmv8r.v v8, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v0, v16, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v8, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %vd = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fma.nxv32bf16(<vscale x 32 x bfloat> %vc, <vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 32 x bfloat> %vd
+}
+
+define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, bfloat %c) strictfp {
+; CHECK-LABEL: vfmadd_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v0, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v8, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vd = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %splat, <vscale x 32 x bfloat> %va, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 32 x bfloat> %vd
+}
+
declare <vscale x 1 x half> @llvm.experimental.constrained.fma.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x half>, metadata, metadata)
define <vscale x 1 x half> @vfmadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x half> %vc) strictfp {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
index a80a943c2e1dbe..2df2212c43db09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
@@ -1,16 +1,573 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
; This tests a mix of vfmacc and vfmadd by using different operand orders to
; trigger commuting in TwoAddressInstructionPass.
+define <vscale x 1 x bfloat> @vfmadd_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v10, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %vd = call <vscale x 1 x bfloat> @llvm.fma.v1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc)
+ ret <vscale x 1 x bfloat> %vd
+}
+
+define <vscale x 1 x bfloat> @vfmadd_vv_nxv1bf16_commuted(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv1bf16_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v9, v8, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vd = call <vscale x 1 x bfloat> @llvm.fma.v1bf16(<vscale x 1 x bfloat> %vb, <vscale x 1 x bfloat> %vc, <vscale x 1 x bfloat> %va)
+ ret <vscale x 1 x bfloat> %vd
+}
+
+define <vscale x 1 x bfloat> @vfmadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, bfloat %c) {
+; CHECK-LABEL: vfmadd_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vd = call <vscale x 1 x bfloat> @llvm.fma.v1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %splat, <vscale x 1 x bfloat> %vb)
+ ret <vscale x 1 x bfloat> %vd
+}
+
+declare <vscale x 2 x bfloat> @llvm.fma.v2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+
+define <vscale x 2 x bfloat> @vfmadd_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v12, v9, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %vd = call <vscale x 2 x bfloat> @llvm.fma.v2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vc, <vscale x 2 x bfloat> %vb)
+ ret <vscale x 2 x bfloat> %vd
+}
+
+define <vscale x 2 x bfloat> @vfmadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, bfloat %c) {
+; CHECK-LABEL: vfmadd_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmadd.vv v9, v8, v11
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vd = call <vscale x 2 x bfloat> @llvm.fma.v2bf16(<vscale x 2 x bfloat> %vb, <vscale x 2 x bfloat> %splat, <vscale x 2 x bfloat> %va)
+ ret <vscale x 2 x bfloat> %vd
+}
+
+declare <vscale x 4 x bfloat> @llvm.fma.v4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+
+define <vscale x 4 x bfloat> @vfmadd_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v14, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT: ret
+ %vd = call <vscale x 4 x bfloat> @llvm.fma.v4bf16(<vscale x 4 x bfloat> %vb, <vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vc)
+ ret <vscale x 4 x bfloat> %vd
+}
+
+define <vscale x 4 x bfloat> @vfmadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, bfloat %c) {
+; CHECK-LABEL: vfmadd_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v14, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vd = call <vscale x 4 x bfloat> @llvm.fma.v4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %splat, <vscale x 4 x bfloat> %vb)
+ ret <vscale x 4 x bfloat> %vd
+}
+
+declare <vscale x 8 x bfloat> @llvm.fma.v8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define <vscale x 8 x bfloat> @vfmadd_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v24, v20, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: ret
+ %vd = call <vscale x 8 x bfloat> @llvm.fma.v8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %vc, <vscale x 8 x bfloat> %va)
+ ret <vscale x 8 x bfloat> %vd
+}
+
+define <vscale x 8 x bfloat> @vfmadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, bfloat %c) {
+; CHECK-LABEL: vfmadd_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmadd.vv v24, v20, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vd = call <vscale x 8 x bfloat> @llvm.fma.v8bf16(<vscale x 8 x bfloat> %vb, <vscale x 8 x bfloat> %splat, <vscale x 8 x bfloat> %va)
+ ret <vscale x 8 x bfloat> %vd
+}
+
+declare <vscale x 16 x bfloat> @llvm.fma.v16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x bfloat>)
+
+define <vscale x 16 x bfloat> @vfmadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x bfloat> %vc) {
+; CHECK-LABEL: vfmadd_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %vd = call <vscale x 16 x bfloat> @llvm.fma.v16bf16(<vscale x 16 x bfloat> %vc, <vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb)
+ ret <vscale x 16 x bfloat> %vd
+}
+
+define <vscale x 16 x bfloat> @vfmadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, bfloat %c) {
+; CHECK-LABEL: vfmadd_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmadd.vv v16, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vd = call <vscale x 16 x bfloat> @llvm.fma.v16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %splat, <vscale x 16 x bfloat> %vb)
+ ret <vscale x 16 x bfloat> %vd
+}
+
+declare <vscale x 32 x bfloat> @llvm.fma.v32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
+
+define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %vc) {
+; ZVFH-LABEL: vfmadd_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 5
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFH-NEXT: vl8re16.v v0, (a0)
+; ZVFH-NEXT: vmv8r.v v24, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmv8r.v v16, v8
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmv8r.v v8, v0
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfmadd.vv v0, v16, v24
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfmadd.vv v16, v8, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 5
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmadd_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: vl8re16.v v0, (a0)
+; ZVFHMIN-NEXT: vmv8r.v v24, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv8r.v v16, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv8r.v v8, v0
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
+ %vd = call <vscale x 32 x bfloat> @llvm.fma.v32bf16(<vscale x 32 x bfloat> %vc, <vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va)
+ ret <vscale x 32 x bfloat> %vd
+}
+
+define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, bfloat %c) {
+; ZVFH-LABEL: vfmadd_vf_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: sub sp, sp, a0
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; ZVFH-NEXT: vmv8r.v v24, v16
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: fmv.x.h a0, fa0
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vmv.v.x v16, a0
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v24
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfmadd.vv v0, v16, v24
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v20
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 4
+; ZVFH-NEXT: add a0, sp, a0
+; ZVFH-NEXT: addi a0, a0, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT: vfmadd.vv v24, v8, v16
+; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v24
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: mv a1, a0
+; ZVFH-NEXT: slli a0, a0, 1
+; ZVFH-NEXT: add a0, a0, a1
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmadd_vf_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: sub sp, sp, a0
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; ZVFHMIN-NEXT: vmv8r.v v24, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v16, a0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v20
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v24
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %c, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vd = call <vscale x 32 x bfloat> @llvm.fma.v32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %splat, <vscale x 32 x bfloat> %va)
+ ret <vscale x 32 x bfloat> %vd
+}
+
declare <vscale x 1 x half> @llvm.fma.v1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x half>)
define <vscale x 1 x half> @vfmadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x half> %vc) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll
index caf37b7a0a12bd..b5604add6d25bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll
@@ -1,12 +1,243 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfmax_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv1bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.maxnum.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfmax_nxv1bf16_vf(<vscale x 1 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmax_nxv1bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.maxnum.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %splat)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+
+define <vscale x 2 x bfloat> @vfmax_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv2bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfmax_nxv2bf16_vf(<vscale x 2 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmax_nxv2bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %splat)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+
+define <vscale x 4 x bfloat> @vfmax_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv4bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmax.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfmax_nxv4bf16_vf(<vscale x 4 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmax_nxv4bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmax.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %splat)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define <vscale x 8 x bfloat> @vfmax_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv8bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmax.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfmax_nxv8bf16_vf(<vscale x 8 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmax_nxv8bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmax.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %splat)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.maxnum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>)
+
+define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv16bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.maxnum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vf(<vscale x 16 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmax_nxv16bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.maxnum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %splat)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.maxnum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
+
+define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
+; CHECK-LABEL: vfmax_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.maxnum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vf(<vscale x 32 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmax_nxv32bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.maxnum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %splat)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.maxnum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 7ab999ea4fa7ee..6e38881b4d60fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -1,13 +1,278 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x bfloat> @llvm.vp.maxnum.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.maxnum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.maxnum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.maxnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.maxnum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmax.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.maxnum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.maxnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmax.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.maxnum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmax.vv v10, v12, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.maxnum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.maxnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmax.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.maxnum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmax.vv v12, v16, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.maxnum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.maxnum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.maxnum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.maxnum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.maxnum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.maxnum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.maxnum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.maxnum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -264,10 +529,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -321,10 +586,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll
index b47e14f4f26be6..9212ddab5b1ebf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll
@@ -1,12 +1,243 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfmin_nxv1bf16_vv(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv1bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.minnum.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %b)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfmin_nxv1bf16_vf(<vscale x 1 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmin_nxv1bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.minnum.nxv1bf16(<vscale x 1 x bfloat> %a, <vscale x 1 x bfloat> %splat)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+
+define <vscale x 2 x bfloat> @vfmin_nxv2bf16_vv(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv2bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfmin_nxv2bf16_vf(<vscale x 2 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmin_nxv2bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %splat)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+
+define <vscale x 4 x bfloat> @vfmin_nxv4bf16_vv(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv4bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmin.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfmin_nxv4bf16_vf(<vscale x 4 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmin_nxv4bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmin.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %splat)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define <vscale x 8 x bfloat> @vfmin_nxv8bf16_vv(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv8bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmin.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfmin_nxv8bf16_vf(<vscale x 8 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmin_nxv8bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmin.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %splat)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.minnum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>)
+
+define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv16bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.minnum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vf(<vscale x 16 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmin_nxv16bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.minnum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %splat)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.minnum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
+
+define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
+; CHECK-LABEL: vfmin_nxv32bf16_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.minnum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vf(<vscale x 32 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: vfmin_nxv32bf16_vf:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.minnum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %splat)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.minnum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index e928df85b5bb56..f1d6b2100ae980 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -1,13 +1,278 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x bfloat> @llvm.vp.minnum.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.minnum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.minnum.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.minnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.minnum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmin.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.minnum.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.minnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmin.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.minnum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfmin.vv v10, v12, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.minnum.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.minnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmin.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.minnum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfmin.vv v12, v16, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.minnum.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.minnum.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.minnum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.minnum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.minnum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.minnum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmin_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfmin.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.minnum.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.minnum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -264,10 +529,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -321,10 +586,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
index e82fdf065574f9..999b06ba5a5791 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
@@ -1,12 +1,239 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfmul_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfmul_vv_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fmul.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfmul_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfmul_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fmul.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfmul_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfmul_vv_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fmul.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfmul_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfmul_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fmul.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfmul_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfmul_vv_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmul.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fmul.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfmul_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfmul_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmul.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fmul.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfmul_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfmul_vv_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmul.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fmul.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfmul_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfmul_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmul.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fmul.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfmul_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfmul_vv_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fmul.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfmul_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfmul_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fmul.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfmul_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfmul_vv_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fmul.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfmul_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfmul_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fmul.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
declare <vscale x 1 x half> @llvm.experimental.constrained.fmul.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, metadata, metadata)
define <vscale x 1 x half> @vfmul_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) strictfp {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll
index 70d664aa50ec4f..2ab04a45c8183d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll
@@ -1,12 +1,252 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfmul_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) {
+; CHECK-LABEL: vfmul_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fmul <vscale x 1 x bfloat> %va, %vb
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfmul_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfmul_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = fmul <vscale x 1 x bfloat> %va, %splat
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfmul_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) {
+; CHECK-LABEL: vfmul_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fmul <vscale x 2 x bfloat> %va, %vb
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfmul_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfmul_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfmul.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = fmul <vscale x 2 x bfloat> %va, %splat
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfmul_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) {
+; CHECK-LABEL: vfmul_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmul.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %vc = fmul <vscale x 4 x bfloat> %va, %vb
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfmul_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfmul_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfmul.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = fmul <vscale x 4 x bfloat> %va, %splat
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfmul_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: vfmul_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmul.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %vc = fmul <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfmul_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfmul_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmul.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fmul <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfmul_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfmul_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfmul.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fmul <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfmul_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) {
+; CHECK-LABEL: vfmul_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %vc = fmul <vscale x 16 x bfloat> %va, %vb
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfmul_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfmul_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = fmul <vscale x 16 x bfloat> %va, %splat
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfmul_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
+; CHECK-LABEL: vfmul_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %vc = fmul <vscale x 32 x bfloat> %va, %vb
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfmul_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfmul_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v16, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = fmul <vscale x 32 x bfloat> %va, %splat
+ ret <vscale x 32 x bfloat> %vc
+}
define <vscale x 1 x half> @vfmul_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) {
; ZVFH-LABEL: vfmul_vv_nxv1f16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
index 806b817fd6c4a2..9da1e0a576d5b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
@@ -1,12 +1,110 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfsqrt_nxv1bf16(<vscale x 1 x bfloat> %v) strictfp {
+; CHECK-LABEL: vfsqrt_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %r = call <vscale x 1 x bfloat> @llvm.experimental.constrained.sqrt.nxv1bf16(<vscale x 1 x bfloat> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 1 x bfloat> %r
+}
+
+
+define <vscale x 2 x bfloat> @vfsqrt_nxv2bf16(<vscale x 2 x bfloat> %v) strictfp {
+; CHECK-LABEL: vfsqrt_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %r = call <vscale x 2 x bfloat> @llvm.experimental.constrained.sqrt.nxv2bf16(<vscale x 2 x bfloat> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 2 x bfloat> %r
+}
+
+
+define <vscale x 4 x bfloat> @vfsqrt_nxv4bf16(<vscale x 4 x bfloat> %v) strictfp {
+; CHECK-LABEL: vfsqrt_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfsqrt.v v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x bfloat> @llvm.experimental.constrained.sqrt.nxv4bf16(<vscale x 4 x bfloat> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 4 x bfloat> %r
+}
+
+
+define <vscale x 8 x bfloat> @vfsqrt_nxv8bf16(<vscale x 8 x bfloat> %v) strictfp {
+; CHECK-LABEL: vfsqrt_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsqrt.v v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x bfloat> @llvm.experimental.constrained.sqrt.nxv8bf16(<vscale x 8 x bfloat> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 8 x bfloat> %r
+}
+
+
+define <vscale x 16 x bfloat> @vfsqrt_nxv16bf16(<vscale x 16 x bfloat> %v) strictfp {
+; CHECK-LABEL: vfsqrt_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %r = call <vscale x 16 x bfloat> @llvm.experimental.constrained.sqrt.nxv16bf16(<vscale x 16 x bfloat> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 16 x bfloat> %r
+}
+
+
+define <vscale x 32 x bfloat> @vfsqrt_nxv32bf16(<vscale x 32 x bfloat> %v) strictfp {
+; CHECK-LABEL: vfsqrt_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %r = call <vscale x 32 x bfloat> @llvm.experimental.constrained.sqrt.nxv32bf16(<vscale x 32 x bfloat> %v, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <vscale x 32 x bfloat> %r
+}
declare <vscale x 1 x half> @llvm.experimental.constrained.sqrt.nxv1f16(<vscale x 1 x half>, metadata, metadata)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
index 329a078cd16633..de31a02cd15452 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
@@ -1,12 +1,105 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfsqrt_nxv1bf16(<vscale x 1 x bfloat> %v) {
+; CHECK-LABEL: vfsqrt_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %r = call <vscale x 1 x bfloat> @llvm.sqrt.nxv1bf16(<vscale x 1 x bfloat> %v)
+ ret <vscale x 1 x bfloat> %r
+}
+
+define <vscale x 2 x bfloat> @vfsqrt_nxv2bf16(<vscale x 2 x bfloat> %v) {
+; CHECK-LABEL: vfsqrt_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %r = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> %v)
+ ret <vscale x 2 x bfloat> %r
+}
+
+define <vscale x 4 x bfloat> @vfsqrt_nxv4bf16(<vscale x 4 x bfloat> %v) {
+; CHECK-LABEL: vfsqrt_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfsqrt.v v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> %v)
+ ret <vscale x 4 x bfloat> %r
+}
+
+define <vscale x 8 x bfloat> @vfsqrt_nxv8bf16(<vscale x 8 x bfloat> %v) {
+; CHECK-LABEL: vfsqrt_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsqrt.v v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> %v)
+ ret <vscale x 8 x bfloat> %r
+}
+
+define <vscale x 16 x bfloat> @vfsqrt_nxv16bf16(<vscale x 16 x bfloat> %v) {
+; CHECK-LABEL: vfsqrt_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %r = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> %v)
+ ret <vscale x 16 x bfloat> %r
+}
+
+define <vscale x 32 x bfloat> @vfsqrt_nxv32bf16(<vscale x 32 x bfloat> %v) {
+; CHECK-LABEL: vfsqrt_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %r = call <vscale x 32 x bfloat> @llvm.sqrt.nxv32bf16(<vscale x 32 x bfloat> %v)
+ ret <vscale x 32 x bfloat> %r
+}
declare <vscale x 1 x half> @llvm.sqrt.nxv1f16(<vscale x 1 x half>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index bd229e0220a4b6..574c2e05263015 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -1,13 +1,236 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x bfloat> @llvm.vp.sqrt.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfsqrt_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.sqrt.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfsqrt_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.sqrt.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.sqrt.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfsqrt_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.sqrt.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfsqrt_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfsqrt.v v9, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.sqrt.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.sqrt.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfsqrt_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsqrt.v v10, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.sqrt.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfsqrt_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsqrt.v v10, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.sqrt.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.sqrt.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x bfloat> @vfsqrt_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfsqrt.v v12, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.sqrt.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfsqrt_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfsqrt.v v12, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.sqrt.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.sqrt.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfsqrt_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.sqrt.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfsqrt_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.sqrt.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.sqrt.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v24, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT: bltu a0, a1, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.sqrt.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vfsqrt_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v16
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v16, a2
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.sqrt.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.sqrt.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfsqrt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -245,10 +468,10 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: vfsqrt.v v24, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: .LBB22_2:
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: vmv1r.v v0, v16
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -286,10 +509,10 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vfsqrt.v v16, v16, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: .LBB23_2:
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfsqrt.v v16, v16
@@ -537,10 +760,10 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <v
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v16, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v8, v8, v0.t
@@ -559,10 +782,10 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64_unmasked(<vscale x 16 x double
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v16, v16
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v8, v8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
index 5729dc4875ae18..e40427a305f61c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
@@ -1,12 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfsub_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfsub_vv_nxv1bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fsub.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfsub_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfsub_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = call <vscale x 1 x bfloat> @llvm.experimental.constrained.fsub.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfsub_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfsub_vv_nxv2bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fsub.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfsub_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfsub_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = call <vscale x 2 x bfloat> @llvm.experimental.constrained.fsub.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfsub_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfsub_vv_nxv4bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fsub.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfsub_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfsub_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = call <vscale x 4 x bfloat> @llvm.experimental.constrained.fsub.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfsub_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfsub_vv_nxv8bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fsub.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfsub_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfsub_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fsub.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfsub_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfsub_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = call <vscale x 8 x bfloat> @llvm.experimental.constrained.fsub.nxv8bf16(<vscale x 8 x bfloat> %splat, <vscale x 8 x bfloat> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfsub_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfsub_vv_nxv16bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fsub.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfsub_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = call <vscale x 16 x bfloat> @llvm.experimental.constrained.fsub.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
+; CHECK-LABEL: vfsub_vv_nxv32bf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+entry:
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
+; CHECK-LABEL: vfsub_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %splat, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret <vscale x 32 x bfloat> %vc
+}
declare <vscale x 1 x half> @llvm.experimental.constrained.fsub.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, metadata, metadata)
define <vscale x 1 x half> @vfsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) strictfp {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
index bd73398fd04b56..e56cfd9ee4eb1a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
@@ -1,12 +1,252 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+define <vscale x 1 x bfloat> @vfsub_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb) {
+; CHECK-LABEL: vfsub_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fsub <vscale x 1 x bfloat> %va, %vb
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 1 x bfloat> @vfsub_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfsub_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 1 x bfloat> %head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %vc = fsub <vscale x 1 x bfloat> %va, %splat
+ ret <vscale x 1 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfsub_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb) {
+; CHECK-LABEL: vfsub_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %vc = fsub <vscale x 2 x bfloat> %va, %vb
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 2 x bfloat> @vfsub_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfsub_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 2 x bfloat> %head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %vc = fsub <vscale x 2 x bfloat> %va, %splat
+ ret <vscale x 2 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfsub_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb) {
+; CHECK-LABEL: vfsub_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %vc = fsub <vscale x 4 x bfloat> %va, %vb
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 4 x bfloat> @vfsub_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfsub_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v10, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %vc = fsub <vscale x 4 x bfloat> %va, %splat
+ ret <vscale x 4 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfsub_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb) {
+; CHECK-LABEL: vfsub_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %vc = fsub <vscale x 8 x bfloat> %va, %vb
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfsub_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfsub_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v12, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fsub <vscale x 8 x bfloat> %va, %splat
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 8 x bfloat> @vfsub_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfsub_fv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %vc = fsub <vscale x 8 x bfloat> %splat, %va
+ ret <vscale x 8 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfsub_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb) {
+; CHECK-LABEL: vfsub_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v16
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %vc = fsub <vscale x 16 x bfloat> %va, %vb
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfsub_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %vc = fsub <vscale x 16 x bfloat> %va, %splat
+ ret <vscale x 16 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
+; CHECK-LABEL: vfsub_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v24, v0, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %vc = fsub <vscale x 32 x bfloat> %va, %vb
+ ret <vscale x 32 x bfloat> %vc
+}
+
+define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
+; CHECK-LABEL: vfsub_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v24, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v0
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %vc = fsub <vscale x 32 x bfloat> %va, %splat
+ ret <vscale x 32 x bfloat> %vc
+}
define <vscale x 1 x half> @vfsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb) {
; ZVFH-LABEL: vfsub_vv_nxv1f16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index fda6d0c48d4a6e..449130e59876f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -1,13 +1,622 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x bfloat> @vfsub_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfsub_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %b, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfsub_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfsub_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv1bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+declare <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x bfloat> @vfsub_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfsub_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %b, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfsub_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfsub_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv2bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfsub.vv v9, v10, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+declare <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x bfloat> @vfsub_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v12, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfsub_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v12, v10
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfsub_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v10, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfsub_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv4bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfsub.vv v10, v10, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+declare <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, i32)
+define <vscale x 8 x bfloat> @vfsub_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v16, v12, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfsub_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v16, v12
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfsub_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v12, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfsub_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv8bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfsub.vv v12, v12, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+declare <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x bfloat> @vfsub_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfsub_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv16bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v12, a1
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+declare <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vmv1r.v v7, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB20_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB20_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vv_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vsetvli a4, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB21_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB21_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v16
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb
+; CHECK-NEXT: vmv8r.v v24, v8
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v16, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a4, a2, 3
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB22_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 3
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
+; CHECK-LABEL: vfsub_vf_nxv32bf16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vmv8r.v v16, v8
+; CHECK-NEXT: fmv.x.h a1, fa0
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a1, a2, 1
+; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: srli a2, a2, 2
+; CHECK-NEXT: vmset.m v24
+; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a2
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vmv4r.v v16, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: bltu a0, a1, .LBB23_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB23_2:
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v24
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
+ %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
declare <vscale x 1 x half> @llvm.vp.fsub.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
define <vscale x 1 x half> @vfsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -514,10 +1123,10 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB20_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB20_2:
+; ZVFHMIN-NEXT: .LBB44_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -571,10 +1180,10 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB21_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB21_2:
+; ZVFHMIN-NEXT: .LBB45_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -649,10 +1258,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: vfsub.vv v16, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB22_2:
+; ZVFHMIN-NEXT: .LBB46_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
@@ -730,10 +1339,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16, v0.t
; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB23_2:
+; ZVFHMIN-NEXT: .LBB47_2:
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index 7d78fa5a8f3ef2..0f8e74942d58d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -1,12 +1,288 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
-; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFHMIN
+
+declare bfloat @llvm.vp.reduce.fadd.nxv1bf16(bfloat, <vscale x 1 x bfloat>, <vscale x 1 x i1>, i32)
+
+define bfloat @vpreduce_fadd_nxv1bf16(bfloat %s, <vscale x 1 x bfloat> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv1bf16(bfloat %s, <vscale x 1 x bfloat> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
+
+define bfloat @vpreduce_ord_fadd_nxv1bf16(bfloat %s, <vscale x 1 x bfloat> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call bfloat @llvm.vp.reduce.fadd.nxv1bf16(bfloat %s, <vscale x 1 x bfloat> %v, <vscale x 1 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
+
+declare bfloat @llvm.vp.reduce.fadd.nxv2bf16(bfloat, <vscale x 2 x bfloat>, <vscale x 2 x i1>, i32)
+
+define bfloat @vpreduce_fadd_nxv2bf16(bfloat %s, <vscale x 2 x bfloat> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv2bf16(bfloat %s, <vscale x 2 x bfloat> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
+
+define bfloat @vpreduce_ord_fadd_nxv2bf16(bfloat %s, <vscale x 2 x bfloat> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call bfloat @llvm.vp.reduce.fadd.nxv2bf16(bfloat %s, <vscale x 2 x bfloat> %v, <vscale x 2 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
+
+declare bfloat @llvm.vp.reduce.fadd.nxv4bf16(bfloat, <vscale x 4 x bfloat>, <vscale x 4 x i1>, i32)
+
+define bfloat @vpreduce_fadd_nxv4bf16(bfloat %s, <vscale x 4 x bfloat> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v10, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv4bf16(bfloat %s, <vscale x 4 x bfloat> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
+
+define bfloat @vpreduce_ord_fadd_nxv4bf16(bfloat %s, <vscale x 4 x bfloat> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v10, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call bfloat @llvm.vp.reduce.fadd.nxv4bf16(bfloat %s, <vscale x 4 x bfloat> %v, <vscale x 4 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
+
+declare bfloat @llvm.vp.reduce.fadd.nxv64bf16(bfloat, <vscale x 64 x bfloat>, <vscale x 64 x i1>, i32)
+
+define bfloat @vpreduce_fadd_nxv64bf16(bfloat %s, <vscale x 64 x bfloat> %v, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_nxv64bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: srli a1, a3, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v7, v0, a1
+; CHECK-NEXT: slli a5, a3, 2
+; CHECK-NEXT: sub a1, a0, a5
+; CHECK-NEXT: sltu a2, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: slli a4, a3, 1
+; CHECK-NEXT: sub a2, a1, a4
+; CHECK-NEXT: sltu a6, a1, a2
+; CHECK-NEXT: bltu a1, a4, .LBB6_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a4
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: bltu a0, a5, .LBB6_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: mv a0, a5
+; CHECK-NEXT: .LBB6_4:
+; CHECK-NEXT: and a2, a6, a2
+; CHECK-NEXT: sub a5, a0, a4
+; CHECK-NEXT: sltu a6, a0, a5
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: and a5, a6, a5
+; CHECK-NEXT: srli a3, a3, 2
+; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v6, v0, a3
+; CHECK-NEXT: bltu a0, a4, .LBB6_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: mv a0, a4
+; CHECK-NEXT: .LBB6_6:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa5, fa5
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a5, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v6
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa5, fa5
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa5, fa5
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v7, a3
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call reassoc bfloat @llvm.vp.reduce.fadd.nxv64bf16(bfloat %s, <vscale x 64 x bfloat> %v, <vscale x 64 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
+
+define bfloat @vpreduce_ord_fadd_nxv64bf16(bfloat %s, <vscale x 64 x bfloat> %v, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_nxv64bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: srli a1, a3, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v7, v0, a1
+; CHECK-NEXT: slli a5, a3, 2
+; CHECK-NEXT: sub a1, a0, a5
+; CHECK-NEXT: sltu a2, a0, a1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: and a1, a2, a1
+; CHECK-NEXT: slli a4, a3, 1
+; CHECK-NEXT: sub a2, a1, a4
+; CHECK-NEXT: sltu a6, a1, a2
+; CHECK-NEXT: bltu a1, a4, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a4
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: bltu a0, a5, .LBB7_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: mv a0, a5
+; CHECK-NEXT: .LBB7_4:
+; CHECK-NEXT: and a2, a6, a2
+; CHECK-NEXT: sub a5, a0, a4
+; CHECK-NEXT: sltu a6, a0, a5
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: and a5, a6, a5
+; CHECK-NEXT: srli a3, a3, 2
+; CHECK-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v6, v0, a3
+; CHECK-NEXT: bltu a0, a4, .LBB7_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: mv a0, a4
+; CHECK-NEXT: .LBB7_6:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa5, fa5
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a5, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vmv1r.v v0, v6
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa5, fa5
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa5, fa5
+; CHECK-NEXT: fcvt.s.bf16 fa5, fa5
+; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa5
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v7, a3
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v24, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fcvt.bf16.s fa0, fa5
+; CHECK-NEXT: ret
+ %r = call bfloat @llvm.vp.reduce.fadd.nxv64bf16(bfloat %s, <vscale x 64 x bfloat> %v, <vscale x 64 x i1> %m, i32 %evl)
+ ret bfloat %r
+}
declare half @llvm.vp.reduce.fadd.nxv1f16(half, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
@@ -184,10 +460,10 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; ZVFH-NEXT: sltu a3, a0, a1
; ZVFH-NEXT: addi a3, a3, -1
; ZVFH-NEXT: and a1, a3, a1
-; ZVFH-NEXT: bltu a0, a2, .LBB6_2
+; ZVFH-NEXT: bltu a0, a2, .LBB14_2
; ZVFH-NEXT: # %bb.1:
; ZVFH-NEXT: mv a0, a2
-; ZVFH-NEXT: .LBB6_2:
+; ZVFH-NEXT: .LBB14_2:
; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfmv.s.f v25, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -212,15 +488,15 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; ZVFHMIN-NEXT: slli a4, a3, 1
; ZVFHMIN-NEXT: sub a2, a1, a4
; ZVFHMIN-NEXT: sltu a6, a1, a2
-; ZVFHMIN-NEXT: bltu a1, a4, .LBB6_2
+; ZVFHMIN-NEXT: bltu a1, a4, .LBB14_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a4
-; ZVFHMIN-NEXT: .LBB6_2:
+; ZVFHMIN-NEXT: .LBB14_2:
; ZVFHMIN-NEXT: addi a6, a6, -1
-; ZVFHMIN-NEXT: bltu a0, a5, .LBB6_4
+; ZVFHMIN-NEXT: bltu a0, a5, .LBB14_4
; ZVFHMIN-NEXT: # %bb.3:
; ZVFHMIN-NEXT: mv a0, a5
-; ZVFHMIN-NEXT: .LBB6_4:
+; ZVFHMIN-NEXT: .LBB14_4:
; ZVFHMIN-NEXT: and a2, a6, a2
; ZVFHMIN-NEXT: sub a5, a0, a4
; ZVFHMIN-NEXT: sltu a6, a0, a5
@@ -229,10 +505,10 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: bltu a0, a4, .LBB6_6
+; ZVFHMIN-NEXT: bltu a0, a4, .LBB14_6
; ZVFHMIN-NEXT: # %bb.5:
; ZVFHMIN-NEXT: mv a0, a4
-; ZVFHMIN-NEXT: .LBB6_6:
+; ZVFHMIN-NEXT: .LBB14_6:
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
@@ -290,10 +566,10 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; ZVFH-NEXT: sltu a3, a0, a1
; ZVFH-NEXT: addi a3, a3, -1
; ZVFH-NEXT: and a1, a3, a1
-; ZVFH-NEXT: bltu a0, a2, .LBB7_2
+; ZVFH-NEXT: bltu a0, a2, .LBB15_2
; ZVFH-NEXT: # %bb.1:
; ZVFH-NEXT: mv a0, a2
-; ZVFH-NEXT: .LBB7_2:
+; ZVFH-NEXT: .LBB15_2:
; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; ZVFH-NEXT: vfmv.s.f v25, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -318,15 +594,15 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; ZVFHMIN-NEXT: slli a4, a3, 1
; ZVFHMIN-NEXT: sub a2, a1, a4
; ZVFHMIN-NEXT: sltu a6, a1, a2
-; ZVFHMIN-NEXT: bltu a1, a4, .LBB7_2
+; ZVFHMIN-NEXT: bltu a1, a4, .LBB15_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a4
-; ZVFHMIN-NEXT: .LBB7_2:
+; ZVFHMIN-NEXT: .LBB15_2:
; ZVFHMIN-NEXT: addi a6, a6, -1
-; ZVFHMIN-NEXT: bltu a0, a5, .LBB7_4
+; ZVFHMIN-NEXT: bltu a0, a5, .LBB15_4
; ZVFHMIN-NEXT: # %bb.3:
; ZVFHMIN-NEXT: mv a0, a5
-; ZVFHMIN-NEXT: .LBB7_4:
+; ZVFHMIN-NEXT: .LBB15_4:
; ZVFHMIN-NEXT: and a2, a6, a2
; ZVFHMIN-NEXT: sub a5, a0, a4
; ZVFHMIN-NEXT: sltu a6, a0, a5
@@ -335,10 +611,10 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: bltu a0, a4, .LBB7_6
+; ZVFHMIN-NEXT: bltu a0, a4, .LBB15_6
; ZVFHMIN-NEXT: # %bb.5:
; ZVFHMIN-NEXT: mv a0, a4
-; ZVFHMIN-NEXT: .LBB7_6:
+; ZVFHMIN-NEXT: .LBB15_6:
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
@@ -592,12 +868,12 @@ define float @vreduce_fminimum_nxv4f32(float %start, <vscale x 4 x float> %val,
; CHECK-NEXT: feq.s a1, fa0, fa0
; CHECK-NEXT: xori a1, a1, 1
; CHECK-NEXT: or a0, a0, a1
-; CHECK-NEXT: beqz a0, .LBB22_2
+; CHECK-NEXT: beqz a0, .LBB30_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: .LBB30_2:
; CHECK-NEXT: vfmv.f.s fa0, v10
; CHECK-NEXT: ret
%s = call float @llvm.vp.reduce.fminimum.nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 %evl)
@@ -616,12 +892,12 @@ define float @vreduce_fmaximum_nxv4f32(float %start, <vscale x 4 x float> %val,
; CHECK-NEXT: feq.s a1, fa0, fa0
; CHECK-NEXT: xori a1, a1, 1
; CHECK-NEXT: or a0, a0, a1
-; CHECK-NEXT: beqz a0, .LBB23_2
+; CHECK-NEXT: beqz a0, .LBB31_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB23_2:
+; CHECK-NEXT: .LBB31_2:
; CHECK-NEXT: vfmv.f.s fa0, v10
; CHECK-NEXT: ret
%s = call float @llvm.vp.reduce.fmaximum.nxv4f32(float %start, <vscale x 4 x float> %val, <vscale x 4 x i1> %m, i32 %evl)
@@ -666,12 +942,12 @@ define float @vreduce_fminimum_v4f32(float %start, <4 x float> %val, <4 x i1> %m
; CHECK-NEXT: feq.s a1, fa0, fa0
; CHECK-NEXT: xori a1, a1, 1
; CHECK-NEXT: or a0, a0, a1
-; CHECK-NEXT: beqz a0, .LBB26_2
+; CHECK-NEXT: beqz a0, .LBB34_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB26_2:
+; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vfmv.f.s fa0, v9
; CHECK-NEXT: ret
%s = call float @llvm.vp.reduce.fminimum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 %evl)
@@ -690,12 +966,12 @@ define float @vreduce_fmaximum_v4f32(float %start, <4 x float> %val, <4 x i1> %m
; CHECK-NEXT: feq.s a1, fa0, fa0
; CHECK-NEXT: xori a1, a1, 1
; CHECK-NEXT: or a0, a0, a1
-; CHECK-NEXT: beqz a0, .LBB27_2
+; CHECK-NEXT: beqz a0, .LBB35_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: lui a0, 523264
; CHECK-NEXT: fmv.w.x fa0, a0
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB27_2:
+; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vfmv.f.s fa0, v9
; CHECK-NEXT: ret
%s = call float @llvm.vp.reduce.fmaximum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 %evl)