[llvm] [LLVM][CodeGen] Add lowering for scalable vector bfloat operations. (PR #109803)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 24 07:25:50 PDT 2024
https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/109803
Specifically:
fabs, fadd, fceil, fdiv, ffloor, fma, fmax, fmaxnm, fmin, fminnm,
fmul, fnearbyint, fneg, frint, fround, froundeven, fsqrt, fsub &
ftrunc
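
On targets without the sve-b16b16 feature, the arithmetic operations are
lowered by extending to single precision, performing the operation there
and rounding the result back to bfloat. As a minimal sketch (the function
name is illustrative), IR such as:

  define <vscale x 4 x bfloat> @f(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
    %res = fadd <vscale x 4 x bfloat> %a, %b
    ret <vscale x 4 x bfloat> %res
  }

is emitted as an lsl-by-16 of each operand (bf16 occupies the top half of
f32), a single-precision fadd, and a bfcvt back to bf16, as the
fadd_nxv4bf16 test below shows. nxv8bf16 operations are first split into
two nxv4bf16 halves.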
From c638ddda27d8022e904bc36638c90528da6d30f2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Fri, 30 Aug 2024 15:59:48 +0100
Subject: [PATCH] [LLVM][CodeGen] Add lowering for scalable vector bfloat
operations.
Specifically:
fabs, fadd, fceil, fdiv, ffloor, fma, fmax, fmaxnm, fmin, fminnm,
fmul, fnearbyint, fneg, frint, fround, froundeven, fsqrt, fsub &
ftrunc
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 6 +
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 3 +
.../Target/AArch64/AArch64ISelLowering.cpp | 88 ++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 +
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +
llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +
llvm/test/CodeGen/AArch64/sve-bf16-arith.ll | 752 ++++++++++++++++++
.../test/CodeGen/AArch64/sve-bf16-rounding.ll | 355 +++++++++
8 files changed, 1218 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index d6c2c36a0d482a..c7e0c704efceff 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1565,6 +1565,12 @@ class SelectionDAG {
SDValue getSetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
MachineMemOperand *MMO);
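+  /// Return an EXTRACT_SUBVECTOR node taking the VT-sized subvector of V
+  /// that starts at vector index Idx.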
+ SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue V,
+ uint64_t Idx) {
+ return getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+ getVectorIdxConstant(Idx, DL));
+ }
+
/// Construct a node to track a Value* through the backend.
SDValue getSrcValue(const Value *v);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 6067b3b29ea181..82bba661dba0f9 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -191,6 +191,9 @@ class SDValue {
return getValueType().getSimpleVT();
}
+ /// Return the scalar ValueType of the referenced return value.
+ EVT getScalarValueType() const { return getValueType().getScalarType(); }
+
/// Returns the size of the value in bits.
///
/// If the value type is a scalable vector type, the scalable property will
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4166d9bd22bc01..c77d9631b5ffab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1663,12 +1663,32 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
setOperationAction(ISD::BITCAST, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::FCEIL, VT, Custom);
+ setOperationAction(ISD::FDIV, VT, Custom);
+ setOperationAction(ISD::FFLOOR, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FMAXIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMINNUM, VT, Custom);
+ setOperationAction(ISD::FNEARBYINT, VT, Custom);
setOperationAction(ISD::FP_EXTEND, VT, Custom);
setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::FRINT, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
+ setOperationAction(ISD::FROUNDEVEN, VT, Custom);
+ setOperationAction(ISD::FSQRT, VT, Custom);
+ setOperationAction(ISD::FTRUNC, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
+
+ if (!Subtarget->hasSVEB16B16()) {
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FSUB, VT, Custom);
+ }
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
@@ -7051,32 +7071,58 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
case ISD::FSUB:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
case ISD::FMUL:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
case ISD::FMA:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
case ISD::FDIV:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
case ISD::FNEG:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
case ISD::FCEIL:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
case ISD::FFLOOR:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
case ISD::FNEARBYINT:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
case ISD::FRINT:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
case ISD::FROUND:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
case ISD::FROUNDEVEN:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
case ISD::FTRUNC:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
case ISD::FSQRT:
+ if (Op.getScalarValueType() == MVT::bf16)
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
case ISD::FABS:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
@@ -7242,12 +7288,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::SUB:
return LowerToScalableOp(Op, DAG);
case ISD::FMAXIMUM:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
case ISD::FMAXNUM:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
case ISD::FMINIMUM:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
case ISD::FMINNUM:
+ if (Op.getScalarValueType() == MVT::bf16 && !Subtarget->hasSVEB16B16())
+ return LowerBFloatOp(Op, DAG);
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
case ISD::VSELECT:
return LowerFixedLengthVectorSelectToSVE(Op, DAG);
@@ -28466,6 +28520,40 @@ SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
return convertFromScalableVector(DAG, VT, ScalableRes);
}
+// Lower bfloat16 operations by upcasting to float32, performing the operation
+// and then downcasting the result back to bfloat16.
+SDValue AArch64TargetLowering::LowerBFloatOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ assert(isTypeLegal(VT) && VT.isScalableVector() && "Unexpected type!");
+
+ // Split the vector and try again.
+ if (VT == MVT::nxv8bf16) {
+ SmallVector<SDValue, 4> LoOps, HiOps;
+ for (const SDValue &V : Op->op_values()) {
+ LoOps.push_back(DAG.getExtractSubvector(DL, MVT::nxv4bf16, V, 0));
+ HiOps.push_back(DAG.getExtractSubvector(DL, MVT::nxv4bf16, V, 4));
+ }
+
+ unsigned Opc = Op.getOpcode();
+ SDValue SplitOpLo = DAG.getNode(Opc, DL, MVT::nxv4bf16, LoOps);
+ SDValue SplitOpHi = DAG.getNode(Opc, DL, MVT::nxv4bf16, HiOps);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SplitOpLo, SplitOpHi);
+ }
+
+ // Promote to float and try again.
+ EVT PromoteVT = VT.changeVectorElementType(MVT::f32);
+
+ SmallVector<SDValue, 4> Ops;
+ for (const SDValue &V : Op->op_values())
+ Ops.push_back(DAG.getNode(ISD::FP_EXTEND, DL, PromoteVT, V));
+
+ SDValue PromotedOp = DAG.getNode(Op.getOpcode(), DL, PromoteVT, Ops);
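+  // The zero FP_ROUND flag operand records that the conversion may lose
+  // precision, i.e. this is a genuine narrowing back to bfloat.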
+ return DAG.getNode(ISD::FP_ROUND, DL, VT, PromotedOp,
+ DAG.getIntPtrConstant(0, DL, true));
+}
+
// Convert vector operation 'Op' to an equivalent predicated operation whereby
// the original operation's type is used to construct a suitable predicate.
// NOTE: The results for inactive lanes are undefined.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 480bf60360bf55..8c06214bba5b54 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1224,6 +1224,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBFloatOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 7240f6a22a87bd..078f4f2e14cabf 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -663,6 +663,13 @@ let Predicates = [HasSVEorSME] in {
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;
+ foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in {
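+    // fabs and fneg only touch the sign bit, so implement them as a bitwise
+    // AND/EOR with the 16-bit element mask replicated across 64 bits.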
+ def : Pat<(VT (fabs VT:$op)),
(AND_ZI $op, (i64 (logical_imm64_XFORM (i64 0x7fff7fff7fff7fff))))>;
+ def : Pat<(VT (fneg VT:$op)),
(EOR_ZI $op, (i64 (logical_imm64_XFORM (i64 0x8000800080008000))))>;
+ }
+
// zext(cmpeq(x, splat(0))) -> cnot(x)
def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive):$Pg), nxv16i8:$Op2, (SVEDup0), SETEQ)))),
(CNOT_ZPmZ_B $Op2, $Pg, $Op2)>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 0bfac6465a1f30..c7059b8e4e8d4a 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2299,6 +2299,8 @@ multiclass sve_fp_3op_u_zd_bfloat<bits<3> opc, string asm, SDPatternOperator op>
def NAME : sve_fp_3op_u_zd<0b00, opc, asm, ZPR16>;
def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv4bf16, op, nxv4bf16, nxv4bf16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv2bf16, op, nxv2bf16, nxv2bf16, !cast<Instruction>(NAME)>;
}
multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
@@ -9078,6 +9080,8 @@ multiclass sve_fp_bin_pred_bfloat<SDPatternOperator op> {
def _UNDEF : PredTwoOpPseudo<NAME, ZPR16, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+ def : SVE_3_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+ def : SVE_3_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, !cast<Pseudo>(NAME # _UNDEF)>;
}
// Predicated pseudo floating point three operand instructions.
@@ -9099,6 +9103,8 @@ multiclass sve_fp_3op_pred_bfloat<SDPatternOperator op> {
def _UNDEF : PredThreeOpPseudo<NAME, ZPR16, FalseLanesUndef>;
def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _UNDEF)>;
+ def : SVE_4_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, nxv4bf16, !cast<Instruction>(NAME # _UNDEF)>;
+ def : SVE_4_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, nxv2bf16, !cast<Instruction>(NAME # _UNDEF)>;
}
// Predicated pseudo integer two operand instructions.
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
new file mode 100644
index 00000000000000..e8468ddfeed181
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
@@ -0,0 +1,752 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
+; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,B16B16
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FABS
+;
+
+define <vscale x 2 x bfloat> @fabs_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fabs_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.h, z0.h, #0x7fff
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fabs_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fabs_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.h, z0.h, #0x7fff
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fabs_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fabs_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.h, z0.h, #0x7fff
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FADD
+;
+
+define <vscale x 2 x bfloat> @fadd_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fadd <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fadd_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fadd <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fadd_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fadd z2.s, z3.s, z2.s
+; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fadd <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FDIV
+;
+
+define <vscale x 2 x bfloat> @fdiv_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; CHECK-LABEL: fdiv_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fdiv <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fdiv_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; CHECK-LABEL: fdiv_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fdiv <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fdiv_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: fdiv_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z2.s, z2.s, #16
+; CHECK-NEXT: lsl z3.s, z3.s, #16
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z2.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = fdiv <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMAX
+;
+
+define <vscale x 2 x bfloat> @fmax_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmax_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmax_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmax z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMAXNM
+;
+
+define <vscale x 2 x bfloat> @fmaxnm_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmaxnm_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmaxnm_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMIN
+;
+
+define <vscale x 2 x bfloat> @fmin_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmin_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmin_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmin z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMINNM
+;
+
+define <vscale x 2 x bfloat> @fminnm_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fminnm_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fminnm_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fminnm_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fminnm_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fminnm z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fminnm_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMLA
+;
+
+define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c) {
+; NOB16B16-LABEL: fmla_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmla_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
+; NOB16B16-LABEL: fmla_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmla_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
+; NOB16B16-LABEL: fmla_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z3.s, z1.h
+; NOB16B16-NEXT: uunpkhi z4.s, z0.h
+; NOB16B16-NEXT: uunpkhi z5.s, z2.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: uunpklo z2.s, z2.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z4.s, z4.s, #16
+; NOB16B16-NEXT: lsl z5.s, z5.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s
+; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmla_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMUL
+;
+
+define <vscale x 2 x bfloat> @fmul_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmul_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmul_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fmul <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmul_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmul_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmul_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fmul <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmul_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmul z2.s, z3.s, z2.s
+; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmul_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fmul <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FNEG
+;
+
+define <vscale x 2 x bfloat> @fneg_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fneg_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor z0.h, z0.h, #0x8000
+; CHECK-NEXT: ret
+ %res = fneg <vscale x 2 x bfloat> %a
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fneg_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fneg_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor z0.h, z0.h, #0x8000
+; CHECK-NEXT: ret
+ %res = fneg <vscale x 4 x bfloat> %a
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fneg_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fneg_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor z0.h, z0.h, #0x8000
+; CHECK-NEXT: ret
+ %res = fneg <vscale x 8 x bfloat> %a
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FSQRT
+;
+
+define <vscale x 2 x bfloat> @fsqrt_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fsqrt_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fsqrt_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fsqrt_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fsqrt_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fsqrt_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FSUB
+;
+
+define <vscale x 2 x bfloat> @fsub_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fsub_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fsub_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fsub <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fsub_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fsub_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fsub_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fsub <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fsub_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fsub_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fsub z2.s, z3.s, z2.s
+; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fsub_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fsub <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll b/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll
new file mode 100644
index 00000000000000..65d273d1209827
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll
@@ -0,0 +1,355 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FCEIL
+;
+
+define <vscale x 2 x bfloat> @frintp_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintp_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintp_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintp_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintp_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintp_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintp z1.s, p0/m, z1.s
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FFLOOR
+;
+
+define <vscale x 2 x bfloat> @frintm_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintm_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.floor.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintm_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintm_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.floor.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintm_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintm_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintm z1.s, p0/m, z1.s
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.floor.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FNEARBYINT
+;
+
+define <vscale x 2 x bfloat> @frinti_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frinti_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frinti_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frinti_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frinti_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frinti_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frinti z1.s, p0/m, z1.s
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FRINT
+;
+
+define <vscale x 2 x bfloat> @frintx_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintx_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.rint.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintx_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintx_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.rint.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintx_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintx_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintx z1.s, p0/m, z1.s
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.rint.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; ROUND
+;
+
+define <vscale x 2 x bfloat> @frinta_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frinta_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.round.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frinta_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frinta_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.round.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frinta_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frinta_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frinta z1.s, p0/m, z1.s
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.round.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; ROUNDEVEN
+;
+
+define <vscale x 2 x bfloat> @frintn_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintn_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.roundeven.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintn_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintn_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintn_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintn_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintn z1.s, p0/m, z1.s
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.roundeven.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FTRUNC
+;
+
+define <vscale x 2 x bfloat> @frintz_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintz_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.trunc.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintz_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintz_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.trunc.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintz_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintz_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintz z1.s, p0/m, z1.s
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.trunc.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.floor.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.floor.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.floor.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.rint.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.rint.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.rint.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.round.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.round.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.round.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.roundeven.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.roundeven.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.trunc.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.trunc.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.trunc.nxv8bf16( <vscale x 8 x bfloat>)