[llvm] [clang] Header (PR #73258)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Nov 23 10:39:58 PST 2023
https://github.com/CarolineConcatto created https://github.com/llvm/llvm-project/pull/73258
None
>From a092c0299b7a8fd28255f39b5b331eff5d79df3c Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Wed, 22 Nov 2023 10:30:08 +0000
Subject: [PATCH 1/2] Revert "Revert "[SVE2.1][Clang][LLVM]Add BFloat16 builtin
in Clang and LLVM intrinisc (#70362)""
This reverts commit e1ee0e85104eed2c68b6821d9e5a2066e4154099.
---
clang/include/clang/Basic/arm_sve.td | 18 ++-
.../acle_sve2p1_bfadd.c | 133 +++++++++++++++++
.../acle_sve2p1_bfclamp.c | 31 ++++
.../acle_sve2p1_bfmax.c | 134 ++++++++++++++++++
.../acle_sve2p1_bfmaxnm.c | 134 ++++++++++++++++++
.../acle_sve2p1_bfmin.c | 134 ++++++++++++++++++
.../acle_sve2p1_bfminnm.c | 134 ++++++++++++++++++
.../acle_sve2p1_bfmla.c | 133 +++++++++++++++++
.../acle_sve2p1_bfmla_lane.c | 60 ++++++++
.../acle_sve2p1_bfmls.c | 133 +++++++++++++++++
.../acle_sve2p1_bfmls_lane.c | 60 ++++++++
.../acle_sve2p1_bfmul.c | 134 ++++++++++++++++++
.../acle_sve2p1_bfmul_lane.c | 61 ++++++++
.../acle_sve2p1_bfsub.c | 134 ++++++++++++++++++
.../acle_sve2p1_imm.cpp | 17 +++
llvm/lib/Target/AArch64/AArch64.td | 8 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 62 +++++---
llvm/lib/Target/AArch64/SVEInstrFormats.td | 58 +++++++-
.../AArch64/sve2p1-intrinsics-bfadd.ll | 62 ++++++++
.../AArch64/sve2p1-intrinsics-bfclamp.ll | 13 ++
.../AArch64/sve2p1-intrinsics-bfmax.ll | 74 ++++++++++
.../AArch64/sve2p1-intrinsics-bfmaxnm.ll | 74 ++++++++++
.../AArch64/sve2p1-intrinsics-bfmin.ll | 74 ++++++++++
.../AArch64/sve2p1-intrinsics-bfminnm.ll | 74 ++++++++++
.../AArch64/sve2p1-intrinsics-bfmla.ll | 35 +++++
.../AArch64/sve2p1-intrinsics-bfmla_lane.ll | 31 ++++
.../AArch64/sve2p1-intrinsics-bfmls.ll | 49 +++----
.../AArch64/sve2p1-intrinsics-bfmls_lane.ll | 31 ++++
.../AArch64/sve2p1-intrinsics-bfmlsl.ll | 43 ++++++
.../AArch64/sve2p1-intrinsics-bfmul.ll | 62 ++++++++
.../AArch64/sve2p1-intrinsics-bfmul_lane.ll | 37 +++++
.../AArch64/sve2p1-intrinsics-bfsub.ll | 62 ++++++++
llvm/test/MC/AArch64/SVE2p1/bfadd.s | 20 +--
llvm/test/MC/AArch64/SVE2p1/bfclamp.s | 10 +-
llvm/test/MC/AArch64/SVE2p1/bfmax.s | 12 +-
llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s | 12 +-
llvm/test/MC/AArch64/SVE2p1/bfmin.s | 12 +-
llvm/test/MC/AArch64/SVE2p1/bfminnm.s | 12 +-
llvm/test/MC/AArch64/SVE2p1/bfmla.s | 22 +--
llvm/test/MC/AArch64/SVE2p1/bfmls.s | 22 +--
llvm/test/MC/AArch64/SVE2p1/bfmul.s | 28 ++--
llvm/test/MC/AArch64/SVE2p1/bfsub.s | 20 +--
42 files changed, 2332 insertions(+), 137 deletions(-)
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index bcdddd6b0874f28d..a1ac926ab9577bb7 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2039,7 +2039,23 @@ def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sv
defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUl", "aarch64_sve_revd">;
}
-////////////////////////////////////////////////////////////////////////////////
+
+let TargetGuard = "sve2p1,b16b16" in {
+defm SVMUL_BF : SInstZPZZ<"svmul", "b", "aarch64_sve_fmul", "aarch64_sve_fmul_u">;
+defm SVADD_BF : SInstZPZZ<"svadd", "b", "aarch64_sve_fadd", "aarch64_sve_fadd_u">;
+defm SVSUB_BF : SInstZPZZ<"svsub", "b", "aarch64_sve_fsub", "aarch64_sve_fsub_u">;
+defm SVMAXNM_BF : SInstZPZZ<"svmaxnm","b", "aarch64_sve_fmaxnm", "aarch64_sve_fmaxnm_u">;
+defm SVMINNM_BF : SInstZPZZ<"svminnm","b", "aarch64_sve_fminnm", "aarch64_sve_fminnm_u">;
+defm SVMAX_BF : SInstZPZZ<"svmax", "b", "aarch64_sve_fmax", "aarch64_sve_fmax_u">;
+defm SVMIN_BF : SInstZPZZ<"svmin", "b", "aarch64_sve_fmin", "aarch64_sve_fmin_u">;
+defm SVMLA_BF : SInstZPZZZ<"svmla", "b", "aarch64_sve_fmla", "aarch64_sve_fmla_u", []>;
+defm SVMLS_BF : SInstZPZZZ<"svmls", "b", "aarch64_sve_fmls", "aarch64_sve_fmls_u", []>;
+def SVMLA_LANE_BF : SInst<"svmla_lane[_{d}]", "ddddi", "b", MergeNone, "aarch64_sve_fmla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLS_LANE_BF : SInst<"svmls_lane[_{d}]", "ddddi", "b", MergeNone, "aarch64_sve_fmls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMUL_LANE_BF : SInst<"svmul_lane[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_fmul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVFCLAMP_BF : SInst<"svclamp[_{d}]", "dddd", "b", MergeNone, "aarch64_sve_fclamp", [], []>;
+} //sve2p1,b16b16
+
// SME2
// SME intrinsics which operate only on vectors and do not require ZA should be added here,
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
new file mode 100644
index 0000000000000000..327c4f078872b3c4
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
@@ -0,0 +1,133 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svadd_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svadd, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svadd_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svadd, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svadd, _bf16, _x)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_n_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_mu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svadd, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_n_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_zu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svadd_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svadd, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_n_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_xu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svadd, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
new file mode 100644
index 0000000000000000..ddb279147bd1b02d
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
@@ -0,0 +1,31 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svclamp_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svclamp_bf16u14__SVBfloat16_tS_S_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svclamp_bf16(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+ return SVE_ACLE_FUNC(svclamp, _bf16,)(op1, op2, op3);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
new file mode 100644
index 0000000000000000..0553b993622bde15
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmax_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmax, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmax_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmax, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmax, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmax_bf16_n_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_mu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmax, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_n_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_zu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmax_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmax, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_n_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_xu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmax, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
new file mode 100644
index 0000000000000000..fbbafde686edba00
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// Builds the intrinsic name: with SVE_OVERLOADED_FORMS the middle (type) token is dropped (A1##A3); otherwise all three tokens are pasted.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmaxnm_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmaxnm_bf16_m (svmaxnm_m under SVE_OVERLOADED_FORMS); the checks above expect a single llvm.aarch64.sve.fmaxnm.nxv8bf16 call on op1 and op2.
+svbfloat16_t test_svmaxnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmaxnm, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// Calls svmaxnm_bf16_z (svmaxnm_z under SVE_OVERLOADED_FORMS); the checks above expect op1 to be select-ed against zeroinitializer under pg before llvm.aarch64.sve.fmaxnm.nxv8bf16.
+svbfloat16_t test_svmaxnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmaxnm, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmaxnm_bf16_x (svmaxnm_x under SVE_OVERLOADED_FORMS); the checks above expect the llvm.aarch64.sve.fmaxnm.u.nxv8bf16 intrinsic.
+svbfloat16_t test_svmaxnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmaxnm, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmaxnm_bf16_n_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_mu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmaxnm_n_bf16_m (svmaxnm_m under SVE_OVERLOADED_FORMS); the checks above expect scalar op2 to be splatted and passed to llvm.aarch64.sve.fmaxnm.nxv8bf16.
+svbfloat16_t test_svmaxnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_n_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_zu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// Calls svmaxnm_n_bf16_z (svmaxnm_z under SVE_OVERLOADED_FORMS); the checks above expect op2 splatted and op1 select-ed against zeroinitializer before llvm.aarch64.sve.fmaxnm.nxv8bf16.
+svbfloat16_t test_svmaxnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_n_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_xu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmaxnm_n_bf16_x (svmaxnm_x under SVE_OVERLOADED_FORMS); the checks above expect scalar op2 to be splatted and passed to llvm.aarch64.sve.fmaxnm.u.nxv8bf16.
+svbfloat16_t test_svmaxnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
new file mode 100644
index 0000000000000000..bf774ee0cef66168
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// Builds the intrinsic name: with SVE_OVERLOADED_FORMS the middle (type) token is dropped (A1##A3); otherwise all three tokens are pasted.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmin_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmin_bf16_m (svmin_m under SVE_OVERLOADED_FORMS); the checks above expect a single llvm.aarch64.sve.fmin.nxv8bf16 call on op1 and op2.
+svbfloat16_t test_svmin_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmin, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// Calls svmin_bf16_z (svmin_z under SVE_OVERLOADED_FORMS); the checks above expect op1 to be select-ed against zeroinitializer under pg before llvm.aarch64.sve.fmin.nxv8bf16.
+svbfloat16_t test_svmin_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmin, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmin_bf16_x (svmin_x under SVE_OVERLOADED_FORMS); the checks above expect the llvm.aarch64.sve.fmin.u.nxv8bf16 intrinsic.
+svbfloat16_t test_svmin_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmin, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmin_bf16_n_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_mu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmin_n_bf16_m (svmin_m under SVE_OVERLOADED_FORMS); the checks above expect scalar op2 to be splatted and passed to llvm.aarch64.sve.fmin.nxv8bf16.
+svbfloat16_t test_svmin_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmin, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_n_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_zu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// Calls svmin_n_bf16_z (svmin_z under SVE_OVERLOADED_FORMS); the checks above expect op2 splatted and op1 select-ed against zeroinitializer before llvm.aarch64.sve.fmin.nxv8bf16.
+svbfloat16_t test_svmin_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmin, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_n_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_xu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svmin_n_bf16_x (svmin_x under SVE_OVERLOADED_FORMS); the checks above expect scalar op2 to be splatted and passed to llvm.aarch64.sve.fmin.u.nxv8bf16.
+svbfloat16_t test_svmin_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmin, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
new file mode 100644
index 0000000000000000..cf00f0d504522bb2
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// Builds the intrinsic name: with SVE_OVERLOADED_FORMS the middle (type) token is dropped (A1##A3); otherwise all three tokens are pasted.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svminnm_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svminnm_bf16_m (svminnm_m under SVE_OVERLOADED_FORMS); the checks above expect a single llvm.aarch64.sve.fminnm.nxv8bf16 call on op1 and op2.
+svbfloat16_t test_svminnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svminnm, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// Calls svminnm_bf16_z (svminnm_z under SVE_OVERLOADED_FORMS); the checks above expect op1 to be select-ed against zeroinitializer under pg before llvm.aarch64.sve.fminnm.nxv8bf16.
+svbfloat16_t test_svminnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svminnm, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svminnm_bf16_x (svminnm_x under SVE_OVERLOADED_FORMS); the checks above expect the llvm.aarch64.sve.fminnm.u.nxv8bf16 intrinsic.
+svbfloat16_t test_svminnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svminnm, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svminnm_bf16_n_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_mu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svminnm_n_bf16_m (svminnm_m under SVE_OVERLOADED_FORMS); the checks above expect scalar op2 to be splatted and passed to llvm.aarch64.sve.fminnm.nxv8bf16.
+svbfloat16_t test_svminnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svminnm, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_n_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_zu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// Calls svminnm_n_bf16_z (svminnm_z under SVE_OVERLOADED_FORMS); the checks above expect op2 splatted and op1 select-ed against zeroinitializer before llvm.aarch64.sve.fminnm.nxv8bf16.
+svbfloat16_t test_svminnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svminnm, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_n_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_xu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+// Calls svminnm_n_bf16_x (svminnm_x under SVE_OVERLOADED_FORMS); the checks above expect scalar op2 to be splatted and passed to llvm.aarch64.sve.fminnm.u.nxv8bf16.
+svbfloat16_t test_svminnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svminnm, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
new file mode 100644
index 0000000000000000..0e1532563f8bb813
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
@@ -0,0 +1,133 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// Name-pasting helper, long enough to represent any SVE builtin: overloaded forms drop the unused middle argument (A1##A3); otherwise all three parts are pasted.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmla_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_S0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect one llvm.aarch64.sve.fmla.nxv8bf16 call on the converted predicate.
+ return SVE_ACLE_FUNC(svmla, _bf16, _m)(pg, op1, op2, op3); // Expands to svmla_bf16_m, or to svmla_m when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmla_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_S0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmla_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect op1 to be select-ed against zeroinitializer under the predicate before the llvm.aarch64.sve.fmla.nxv8bf16 call.
+ return SVE_ACLE_FUNC(svmla, _bf16, _z)(pg, op1, op2, op3); // Expands to svmla_bf16_z, or to svmla_z when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmla_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_S0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect a call to the llvm.aarch64.sve.fmla.u.nxv8bf16 intrinsic.
+ return SVE_ACLE_FUNC(svmla, _bf16, _x)(pg, op1, op2, op3); // Expands to svmla_bf16_x, or to svmla_x when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmla_n_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_u6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect scalar op3 to be splatted (insertelement + shufflevector) and passed to llvm.aarch64.sve.fmla.nxv8bf16.
+ return SVE_ACLE_FUNC(svmla, _n_bf16, _m)(pg, op1, op2, op3); // Expands to svmla_n_bf16_m, or to svmla_m when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmla_n_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_u6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmla_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect op3 to be splatted and op1 to be select-ed against zeroinitializer before the llvm.aarch64.sve.fmla.nxv8bf16 call.
+ return SVE_ACLE_FUNC(svmla, _n_bf16, _z)(pg, op1, op2, op3); // Expands to svmla_n_bf16_z, or to svmla_z when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmla_n_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_u6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect scalar op3 to be splatted and passed to llvm.aarch64.sve.fmla.u.nxv8bf16.
+ return SVE_ACLE_FUNC(svmla, _n_bf16, _x)(pg, op1, op2, op3); // Expands to svmla_n_bf16_x, or to svmla_x when SVE_OVERLOADED_FORMS is defined.
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
new file mode 100644
index 0000000000000000..e408c20e325b5b6d
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// Name-pasting helper, long enough to represent any SVE builtin: overloaded forms drop the unused middle argument (A1##A3); otherwise all three parts are pasted.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmla_lane_bf16_idx1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx1u14__SVBfloat16_tS_S_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmla_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect llvm.aarch64.sve.fmla.lane.nxv8bf16 with the constant lane index i32 1.
+ return SVE_ACLE_FUNC(svmla_lane, _bf16,)(op1, op2, op3, 1); // Expands to svmla_lane_bf16, or to svmla_lane when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmla_lane_bf16_idx3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx3u14__SVBfloat16_tS_S_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmla_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect llvm.aarch64.sve.fmla.lane.nxv8bf16 with the constant lane index i32 3.
+ return SVE_ACLE_FUNC(svmla_lane, _bf16,)(op1, op2, op3, 3); // Expands to svmla_lane_bf16, or to svmla_lane when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmla_lane_bf16_idx7(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx7u14__SVBfloat16_tS_S_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmla_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect llvm.aarch64.sve.fmla.lane.nxv8bf16 with the constant lane index i32 7.
+ return SVE_ACLE_FUNC(svmla_lane, _bf16,)(op1, op2, op3, 7); // Expands to svmla_lane_bf16, or to svmla_lane when SVE_OVERLOADED_FORMS is defined.
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
new file mode 100644
index 0000000000000000..b7d576ea01df6fdf
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
@@ -0,0 +1,133 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// Name-pasting helper, long enough to represent any SVE builtin: overloaded forms drop the unused middle argument (A1##A3); otherwise all three parts are pasted.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmls_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_S0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect one llvm.aarch64.sve.fmls.nxv8bf16 call on the converted predicate.
+ return SVE_ACLE_FUNC(svmls, _bf16, _m)(pg, op1, op2, op3); // Expands to svmls_bf16_m, or to svmls_m when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmls_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_S0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmls_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect op1 to be select-ed against zeroinitializer under the predicate before the llvm.aarch64.sve.fmls.nxv8bf16 call.
+ return SVE_ACLE_FUNC(svmls, _bf16, _z)(pg, op1, op2, op3); // Expands to svmls_bf16_z, or to svmls_z when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmls_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_S0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect a call to the llvm.aarch64.sve.fmls.u.nxv8bf16 intrinsic.
+ return SVE_ACLE_FUNC(svmls, _bf16, _x)(pg, op1, op2, op3); // Expands to svmls_bf16_x, or to svmls_x when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmls_n_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_u6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect scalar op3 to be splatted (insertelement + shufflevector) and passed to llvm.aarch64.sve.fmls.nxv8bf16.
+ return SVE_ACLE_FUNC(svmls, _n_bf16, _m)(pg, op1, op2, op3); // Expands to svmls_n_bf16_m, or to svmls_m when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmls_n_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_u6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmls_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect op3 to be splatted and op1 to be select-ed against zeroinitializer before the llvm.aarch64.sve.fmls.nxv8bf16 call.
+ return SVE_ACLE_FUNC(svmls, _n_bf16, _z)(pg, op1, op2, op3); // Expands to svmls_n_bf16_z, or to svmls_z when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmls_n_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_u6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect scalar op3 to be splatted and passed to llvm.aarch64.sve.fmls.u.nxv8bf16.
+ return SVE_ACLE_FUNC(svmls, _n_bf16, _x)(pg, op1, op2, op3); // Expands to svmls_n_bf16_x, or to svmls_x when SVE_OVERLOADED_FORMS is defined.
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
new file mode 100644
index 0000000000000000..f4d3f9e9bd60a42e
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// Name-pasting helper, long enough to represent any SVE builtin: overloaded forms drop the unused middle argument (A1##A3); otherwise all three parts are pasted.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmls_lane_bf16_idx1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx1u14__SVBfloat16_tS_S_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmls_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect llvm.aarch64.sve.fmls.lane.nxv8bf16 with the constant lane index i32 1.
+ return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 1); // Expands to svmls_lane_bf16, or to svmls_lane when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmls_lane_bf16_idx3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx3u14__SVBfloat16_tS_S_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmls_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect llvm.aarch64.sve.fmls.lane.nxv8bf16 with the constant lane index i32 3.
+ return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 3); // Expands to svmls_lane_bf16, or to svmls_lane when SVE_OVERLOADED_FORMS is defined.
+}
+
+// CHECK-LABEL: @test_svmls_lane_bf16_idx7(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx7u14__SVBfloat16_tS_S_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmls_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{ // Codegen test: the CHECK lines above expect llvm.aarch64.sve.fmls.lane.nxv8bf16 with the constant lane index i32 7.
+ return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 7); // Expands to svmls_lane_bf16, or to svmls_lane when SVE_OVERLOADED_FORMS is defined.
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
new file mode 100644
index 0000000000000000..8b0de974f2473bcb
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmul_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmul_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmul_bf16_n_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_mu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_n_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_zu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmul_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_n_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_xu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
new file mode 100644
index 0000000000000000..44cdf49c57bb86f1
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
@@ -0,0 +1,61 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmul_lane_bf16_idx1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx1u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmul_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 1);
+}
+
+// CHECK-LABEL: @test_svmul_lane_bf16_idx3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx3u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmul_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 3);
+}
+
+// CHECK-LABEL: @test_svmul_lane_bf16_idx7(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx7u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmul_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 7);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
new file mode 100644
index 0000000000000000..c5cdf7efa445b84b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svsub_bf16_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_mu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svsub, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_zu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svsub_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svsub, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_xu10__SVBool_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svsub, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svsub_bf16_n_m(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_mu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svsub, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_n_z(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_zu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svsub_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svsub, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_n_x(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_xu10__SVBool_tu14__SVBfloat16_tu6__bf16(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+ return SVE_ACLE_FUNC(svsub, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp
index 84fdba432c24497a..35bf99bfcf4c5b07 100644
--- a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp
+++ b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp
@@ -117,6 +117,23 @@ void test_svdot_lane_2way(svint32_t s32, svuint32_t u32, svint16_t s16, svuint16
svdot_lane_f32_f16_f16(f32, f16, f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
}
+
+__attribute__((target("+sve2p1+b16b16")))
+void test_svbfml_lane(svbfloat16_t zda, svbfloat16_t zn, svbfloat16_t zm, uint64_t idx){
+ svmla_lane_bf16(zda, zn, zm, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmla_lane_bf16(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmls_lane_bf16(zda, zn, zm, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmls_lane_bf16(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmla_lane_bf16(zda, zn, zm, idx); // expected-error {{argument to 'svmla_lane_bf16' must be a constant integer}}
+ svmls_lane_bf16(zda, zn, zm, idx); // expected-error {{argument to 'svmls_lane_bf16' must be a constant integer}}
+}
+
+__attribute__((target("+sve2p1+b16b16")))
+void test_svbfmul_lane(svbfloat16_t zn, svbfloat16_t zm, uint64_t idx){
+ svmul_lane_bf16(zn, zm, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmul_lane_bf16(zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmul_lane_bf16(zn, zm, idx); // expected-error {{argument to 'svmul_lane_bf16' must be a constant integer}}
+}
__attribute__((target("+sve2p1")))
void test_svextq_lane(svint16_t zn_i16, svint16_t zm_i16, svfloat16_t zn_f16, svfloat16_t zm_f16){
svextq_lane_s16(zn_i16, zm_i16, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}}
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 7176139ec1b73027..914ad0b68a624f65 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -154,6 +154,9 @@ def FeatureExperimentalZeroingPseudos
def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
"UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
+def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
+ "true", "Enable BFloat16 Extension (FEAT_BF16)" >;
+
def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
"NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">;
@@ -178,7 +181,7 @@ def FeatureSVE2p1: SubtargetFeature<"sve2p1", "HasSVE2p1", "true",
"Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>;
def FeatureB16B16 : SubtargetFeature<"b16b16", "HasB16B16", "true",
- "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", []>;
+ "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", [FeatureBF16]>;
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
@@ -447,9 +450,6 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
"true", "Use an instruction sequence for taking the address of a global "
"that allows a memory tag in the upper address bits">;
-def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
- "true", "Enable BFloat16 Extension (FEAT_BF16)" >;
-
def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
"true", "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)">;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 9b17f884083a7676..21cafe9b6c4453a9 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4006,28 +4006,56 @@ def : InstAlias<"pfalse\t$Pd", (PFALSE PNRasPPR8:$Pd), 0>;
// SVE2.1 non-widening BFloat16 to BFloat16 instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasSVE2p1_or_HasSME2p1, HasB16B16] in {
-def BFADD_ZZZ : sve_fp_3op_u_zd<0b00, 0b000, "bfadd", ZPR16>;
-def BFSUB_ZZZ : sve_fp_3op_u_zd<0b00, 0b001, "bfsub", ZPR16>;
-def BFMUL_ZZZ : sve_fp_3op_u_zd<0b00, 0b010, "bfmul", ZPR16>;
+let Predicates = [HasSVE2p1, HasB16B16, UseExperimentalZeroingPseudos] in {
+defm BFADD_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fadd>;
+defm BFSUB_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fsub>;
+defm BFMUL_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmul>;
+defm BFMAXNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmaxnm>;
+defm BFMINNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fminnm>;
+defm BFMIN_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmin>;
+defm BFMAX_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmax>;
+} // End HasSVE2p1, HasB16B16, UseExperimentalZeroingPseudos
-def BFMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b00, "bfmla", ZPR16>;
-def BFMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b01, "bfmls", ZPR16>;
+let Predicates = [HasSVE2p1, HasB16B16] in {
-def BFADD_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0000, "bfadd", ZPR16>;
-def BFSUB_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0001, "bfsub", ZPR16>;
-def BFMUL_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0010, "bfmul", ZPR16>;
-def BFMAXNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0100, "bfmaxnm", ZPR16>;
-def BFMINNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0101, "bfminnm", ZPR16>;
-def BFMAX_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0110, "bfmax", ZPR16>;
-def BFMIN_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0111, "bfmin", ZPR16>;
+defm BFMLA_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b00, "bfmla", "BFMLA_ZPZZZ", AArch64fmla_m1>;
+defm BFMLS_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b01, "bfmls", "BFMLS_ZPZZZ", AArch64fmls_m1>;
-defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10>;
-defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11>;
+defm BFMLA_ZPZZZ : sve_fp_3op_pred_bf<AArch64fmla_p>;
+defm BFMLS_ZPZZZ : sve_fp_3op_pred_bf<AArch64fmls_p>;
-defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul">;
+defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10, int_aarch64_sve_fmla_lane>;
+defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11, int_aarch64_sve_fmls_lane>;
-def BFCLAMP_ZZZ : sve2p1_fclamp<"bfclamp", 0b00, ZPR16>;
+defm BFADD_ZPmZZ : sve2p1_bf_2op_p_zds<0b0000, "bfadd", "BFADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>;
+defm BFSUB_ZPmZZ : sve2p1_bf_2op_p_zds<0b0001, "bfsub", "BFSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryComm>;
+defm BFMUL_ZPmZZ : sve2p1_bf_2op_p_zds<0b0010, "bfmul", "BFMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>;
+
+defm BFADD_ZZZ : sve2p1_bf_3op_u_zd<0b000, "bfadd", fadd, AArch64fadd_p>;
+defm BFSUB_ZZZ : sve2p1_bf_3op_u_zd<0b001, "bfsub", fsub, AArch64fsub_p>;
+defm BFMUL_ZZZ : sve2p1_bf_3op_u_zd<0b010, "bfmul", fmul, AArch64fmul_p>;
+
+defm BFADD_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fadd_p>;
+defm BFSUB_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fsub_p>;
+defm BFMUL_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmul_p>;
+
+
+defm BFMAX_ZPmZZ : sve2p1_bf_2op_p_zds<0b0110, "bfmax", "BFMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
+defm BFMIN_ZPmZZ : sve2p1_bf_2op_p_zds<0b0111, "bfmin", "BFMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
+
+defm BFMAX_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmax_p>;
+defm BFMIN_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmin_p>;
+
+
+defm BFMAXNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0100, "bfmaxnm", "BFMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
+defm BFMINNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0101, "bfminnm", "BFMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
+
+defm BFMAXNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmaxnm_p>;
+defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;
+
+defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;
+
+defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", int_aarch64_sve_fclamp>;
} // End HasSVE2p1_or_HasSME2p1, HasB16B16
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 952ef280f2769899..e765926d8a6355ea 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2118,6 +2118,29 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
let mayRaiseFPException = 1;
}
+multiclass sve2p1_bf_2op_p_zds<bits<4> opc, string asm, string Ps, // Emits one bf16 record named NAME (no suffix) plus one selection pattern for it.
+ SDPatternOperator op, DestructiveInstTypeEnum flags, // op: operator matched by the pattern below; flags: value assigned to DestructiveInstType.
+ string revname="", bit isReverseInstr=0> { // Both are forwarded unchanged to SVEInstr2Rev.
+let DestructiveInstType = flags in {
+ def NAME : sve_fp_2op_p_zds<0b00, opc, asm, ZPR16>, // Class argument sz is fixed to 0b00; register operand class is ZPR16.
+ SVEPseudo2Instr<Ps, 1>, SVEInstr2Rev<NAME , revname , isReverseInstr>; // Ps is the name of the pseudo this instruction is paired with.
+ }
+
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>; // op over (nxv8i1, nxv8bf16, nxv8bf16) yielding nxv8bf16 selects NAME.
+}
+
+multiclass sve2p1_bf_bin_pred_zds<SDPatternOperator op> { // Emits the pseudo NAME#_UNDEF and a pattern that selects it for op.
+ def _UNDEF : PredTwoOpPseudo<NAME, ZPR16, FalseLanesUndef>; // Two-operand predicated pseudo over ZPR16, tagged FalseLanesUndef.
+
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>; // op over (nxv8i1, nxv8bf16, nxv8bf16) yielding nxv8bf16.
+}
+
+multiclass sve2p1_bf_2op_p_zds_zeroing<SDPatternOperator op> { // Emits the pseudo NAME#_ZERO and a pattern that selects it for op.
+ def _ZERO : PredTwoOpPseudo<NAME, ZPR16, FalseLanesZero>; // Same pseudo class as the _UNDEF variant, but tagged FalseLanesZero.
+
+ def : SVE_3_Op_Pat_SelZero<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _ZERO)>; // Uses the SelZero pattern class instead of plain SVE_3_Op_Pat.
+}
+
multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
SDPatternOperator op, DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
@@ -2266,6 +2289,14 @@ multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op,
def : SVE_2_Op_Pred_All_Active<nxv2f64, predicated_op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve2p1_bf_3op_u_zd<bits<3> opc1, string asm, SDPatternOperator op, // Emits one bf16 record named NAME and two patterns that select it.
+ SDPatternOperator predicated_op = null_frag> { // predicated_op defaults to null_frag when the caller passes no predicated operator.
+ def NAME : sve_fp_3op_u_zd<0b00, opc1, asm, ZPR16>; // First class argument fixed to 0b00; operands in ZPR16.
+ def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>; // op over two nxv8bf16 values selects NAME.
+
+ def : SVE_2_Op_Pred_All_Active<nxv8bf16, predicated_op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>; // predicated_op also selects NAME through the Pred_All_Active pattern class.
+}
+
multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
@@ -2324,6 +2355,14 @@ multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, string Ps,
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_fp_3op_p_zds_a_bf<bits<2> opc, string asm, string Ps, // Emits one bf16 record named NAME plus a four-operand selection pattern.
+ SDPatternOperator op> { // op: operator matched by the pattern below.
+ def NAME : sve_fp_3op_p_zds_a<0b00, opc, asm, ZPR16>, // First class argument fixed to 0b00; operands in ZPR16.
+ SVEPseudo2Instr<Ps, 1>, SVEInstr2Rev<NAME, "", 0>; // Paired with pseudo Ps; SVEInstr2Rev gets an empty name and a 0 flag.
+
+ def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>; // op over (nxv8i1, nxv8bf16 x3) yielding nxv8bf16 selects NAME.
+}
+
class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
@@ -2391,7 +2430,7 @@ class sve_fp_fma_by_indexed_elem<bits<2> sz, bits<2> opc, string asm,
let mayRaiseFPException = 1;
}
-multiclass sve2p1_fp_bfma_by_indexed_elem<string asm, bits<2> opc> {
+multiclass sve2p1_fp_bfma_by_indexed_elem<string asm, bits<2> opc, SDPatternOperator op> {
def NAME : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16,
VectorIndexH32b> {
bits<3> Zm;
@@ -2400,6 +2439,8 @@ multiclass sve2p1_fp_bfma_by_indexed_elem<string asm, bits<2> opc> {
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
+ def : Pat<(nxv8bf16 (op nxv8bf16:$op1, nxv8bf16:$op2, nxv8bf16:$op3, (i32 VectorIndexH32b_timm:$idx))),
+ (!cast<Instruction>(NAME) $op1, $op2, $op3, VectorIndexH32b_timm:$idx)>;
}
multiclass sve_fp_fma_by_indexed_elem<bits<2> opc, string asm,
@@ -2456,7 +2497,7 @@ class sve_fp_fmul_by_indexed_elem<bits<2> sz, bit o2, string asm, ZPRRegOp zprty
let mayRaiseFPException = 1;
}
-multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm> {
+multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm, SDPatternOperator ir_intrinsic> {
def NAME : sve_fp_fmul_by_indexed_elem<{0, ?}, 0b1, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
@@ -2464,6 +2505,8 @@ multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm> {
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
+ def : Pat <(nxv8bf16 (ir_intrinsic nxv8bf16:$Op1, nxv8bf16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
+ (!cast<Instruction>(NAME) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
}
multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
@@ -9100,6 +9143,12 @@ multiclass sve_fp_3op_pred_hfd<SDPatternOperator op> {
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D_UNDEF)>;
}
+multiclass sve_fp_3op_pred_bf<SDPatternOperator op> { // Emits the pseudo NAME#_UNDEF and a four-operand pattern that selects it for op.
+ def _UNDEF : PredThreeOpPseudo<NAME, ZPR16, FalseLanesUndef>; // Three-operand predicated pseudo over ZPR16, tagged FalseLanesUndef.
+
+ def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _UNDEF)>; // Cast style matches sve_fp_3op_pred_hfd directly above this multiclass.
+}
+
// Predicated pseudo integer two operand instructions.
multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> {
def _B_UNDEF : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
@@ -9185,6 +9234,11 @@ multiclass sve2p1_fclamp<string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve2p1_bfclamp<string asm, SDPatternOperator op> { // Emits one bf16 clamp record named NAME plus its selection pattern.
+ def NAME : sve2p1_fclamp<asm, 0b00, ZPR16>; // Instantiates the sve2p1_fclamp class with 0b00 and ZPR16.
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>; // op over three nxv8bf16 values (no predicate operand) selects NAME.
+}
+
// SVE two-way dot product
class sve2p1_two_way_dot_vv<string mnemonic, bit u>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
new file mode 100644
index 0000000000000000..221bb3b6045fb296
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfadd_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfadd_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfadd_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfadd_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfadd_u(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfadd_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfadd_u_ptrue(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfadd_u_ptrue:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfadd z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfadd_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfadd_u_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
new file mode 100644
index 0000000000000000..61b67755a35441e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfclamp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfclamp z0.h, z1.h, z2.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
new file mode 100644
index 0000000000000000..24c4fedb34266285
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmax_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmax_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmax_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax_u_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmax_u_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
new file mode 100644
index 0000000000000000..25fe9cf7243a4e42
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmaxnm_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmaxnm_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm_u_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmaxnm_u_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
new file mode 100644
index 0000000000000000..d5b0b8be8b85ea9f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmin_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmin_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmin_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin_u_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmin_u_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
new file mode 100644
index 0000000000000000..c019dc7cbe291049
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfminnm_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfminnm_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfminnm_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm_u_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfminnm_u_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
new file mode 100644
index 0000000000000000..02b1db13ea34f7c5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmla_m(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_m:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_x(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_x:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_z(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_z:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h
+; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a_z, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll
new file mode 100644
index 0000000000000000..d0e3a82df3ff919a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmla_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_lane_idx1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmla z0.h, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 1)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_lane_idx3(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_lane_idx3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmla z0.h, z1.h, z2.h[3]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 3)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_lane_idx7(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_lane_idx7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmla z0.h, z1.h, z2.h[7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 7)
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
index 04ad000f20f070a0..987fe1fb5822aa47 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
@@ -1,43 +1,36 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
-define <vscale x 4 x float> @bfmlslb_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslb_f32:
+define <vscale x 8 x bfloat> @bfmls_m(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_m:
; CHECK: // %bb.0:
-; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h
+; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
- %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
- ret <vscale x 4 x float> %out
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
}
-define <vscale x 4 x float> @bfmlslt_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslt_f32:
+define <vscale x 8 x bfloat> @bfmls_x(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_x:
; CHECK: // %bb.0:
-; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h
+; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
- %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
- ret <vscale x 4 x float> %out
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
}
-define <vscale x 4 x float> @bfmlslb_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslb_lane_f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h[7]
-; CHECK-NEXT: ret
- %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
- ret <vscale x 4 x float> %out
-}
-define <vscale x 4 x float> @bfmlslt_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslt_lane_f32:
+define <vscale x 8 x bfloat> @bfmls_z(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_z:
; CHECK: // %bb.0:
-; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h[7]
+; CHECK-NEXT: mov z3.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h
+; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: ret
- %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
- ret <vscale x 4 x float> %out
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a_z, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
}
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll
new file mode 100644
index 0000000000000000..16b4538ffab9e2b2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmls_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_lane_idx1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmls z0.h, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 1)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmls_lane_idx3(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_lane_idx3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmls z0.h, z1.h, z2.h[3]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 3)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmls_lane_idx7(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_lane_idx7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmls z0.h, z1.h, z2.h[7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 7)
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll
new file mode 100644
index 0000000000000000..2b96d452fba0d506
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 4 x float> @bfmlslb_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslb_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslt_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslt_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslb_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslb_lane_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmlslb z0.s, z1.h, z2.h[7]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslt_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslt_lane_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmlslt z0.s, z1.h, z2.h[7]
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+ ret <vscale x 4 x float> %out
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
new file mode 100644
index 0000000000000000..a04c5a52139cdfa9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmul_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmul_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmul_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmul_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmul_u_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmul_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmul_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmul z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmul_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_u_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll
new file mode 100644
index 0000000000000000..2962d59e707ca205
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmul_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_lane_idx1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[1]
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> %a,
+ <vscale x 8 x bfloat> %b,
+ i32 1)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmul_lane_idx3(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_lane_idx3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[3]
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> %a,
+ <vscale x 8 x bfloat> %b,
+ i32 3)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmul_lane_idx7(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_lane_idx7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfmul z0.h, z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> %a,
+ <vscale x 8 x bfloat> %b,
+ i32 7)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
new file mode 100644
index 0000000000000000..752b5ae9df630761
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfsub_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfsub_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfsub_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfsub_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfsub_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfsub_u_pred:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfsub_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfsub_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfsub z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfsub_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfsub_u_zeroing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+ %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> %pg,
+ <vscale x 8 x bfloat> %a_z,
+ <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfadd.s b/llvm/test/MC/AArch64/SVE2p1/bfadd.s
index 1021df12fc05034c..a29f3e6af8ba48d2 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfadd.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfadd.s
@@ -16,7 +16,7 @@ bfadd z23.h, p3/m, z23.h, z13.h // 01100101-00000000-10001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfadd z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65008db7 <unknown>
movprfx z23, z31
@@ -24,53 +24,53 @@ bfadd z23.h, p3/m, z23.h, z13.h // 01100101-00000000-10001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfadd z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65008db7 <unknown>
bfadd z0.h, p0/m, z0.h, z0.h // 01100101-00000000-10000000-00000000
// CHECK-INST: bfadd z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x80,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65008000 <unknown>
bfadd z21.h, p5/m, z21.h, z10.h // 01100101-00000000-10010101-01010101
// CHECK-INST: bfadd z21.h, p5/m, z21.h, z10.h
// CHECK-ENCODING: [0x55,0x95,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65009555 <unknown>
bfadd z23.h, p3/m, z23.h, z13.h // 01100101-00000000-10001101-10110111
// CHECK-INST: bfadd z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65008db7 <unknown>
bfadd z31.h, p7/m, z31.h, z31.h // 01100101-00000000-10011111-11111111
// CHECK-INST: bfadd z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x9f,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65009fff <unknown>
bfadd z0.h, z0.h, z0.h // 01100101-00000000-00000000-00000000
// CHECK-INST: bfadd z0.h, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x00,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65000000 <unknown>
bfadd z21.h, z10.h, z21.h // 01100101-00010101-00000001-01010101
// CHECK-INST: bfadd z21.h, z10.h, z21.h
// CHECK-ENCODING: [0x55,0x01,0x15,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65150155 <unknown>
bfadd z23.h, z13.h, z8.h // 01100101-00001000-00000001-10110111
// CHECK-INST: bfadd z23.h, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x01,0x08,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 650801b7 <unknown>
bfadd z31.h, z31.h, z31.h // 01100101-00011111-00000011-11111111
// CHECK-INST: bfadd z31.h, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x03,0x1f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 651f03ff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfclamp.s b/llvm/test/MC/AArch64/SVE2p1/bfclamp.s
index d7b85edb1730ed49..aed96f3d91e9876e 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfclamp.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfclamp.s
@@ -17,30 +17,30 @@ bfclamp z23.h, z13.h, z8.h // 01100100-00101000-00100101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfclamp z23.h, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x25,0x28,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 642825b7 <unknown>
bfclamp z0.h, z0.h, z0.h // 01100100-00100000-00100100-00000000
// CHECK-INST: bfclamp z0.h, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x24,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64202400 <unknown>
bfclamp z21.h, z10.h, z21.h // 01100100-00110101-00100101-01010101
// CHECK-INST: bfclamp z21.h, z10.h, z21.h
// CHECK-ENCODING: [0x55,0x25,0x35,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64352555 <unknown>
bfclamp z23.h, z13.h, z8.h // 01100100-00101000-00100101-10110111
// CHECK-INST: bfclamp z23.h, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x25,0x28,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 642825b7 <unknown>
bfclamp z31.h, z31.h, z31.h // 01100100-00111111-00100111-11111111
// CHECK-INST: bfclamp z31.h, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x27,0x3f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 643f27ff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmax.s b/llvm/test/MC/AArch64/SVE2p1/bfmax.s
index cd67abc498f3bd2d..bf69c0a0406868c4 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmax.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmax.s
@@ -17,7 +17,7 @@ bfmax z23.h, p3/m, z23.h, z13.h // 01100101-00000110-10001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfmax z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65068db7 <unknown>
movprfx z23, z31
@@ -25,29 +25,29 @@ bfmax z23.h, p3/m, z23.h, z13.h // 01100101-00000110-10001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmax z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65068db7 <unknown>
bfmax z0.h, p0/m, z0.h, z0.h // 01100101-00000110-10000000-00000000
// CHECK-INST: bfmax z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x80,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65068000 <unknown>
bfmax z21.h, p5/m, z21.h, z10.h // 01100101-00000110-10010101-01010101
// CHECK-INST: bfmax z21.h, p5/m, z21.h, z10.h
// CHECK-ENCODING: [0x55,0x95,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65069555 <unknown>
bfmax z23.h, p3/m, z23.h, z13.h // 01100101-00000110-10001101-10110111
// CHECK-INST: bfmax z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65068db7 <unknown>
bfmax z31.h, p7/m, z31.h, z31.h // 01100101-00000110-10011111-11111111
// CHECK-INST: bfmax z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x9f,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65069fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s b/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s
index 83669ebc42b1fda6..8e4ffc31218ab5c4 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s
@@ -17,7 +17,7 @@ bfmaxnm z23.h, p3/m, z23.h, z13.h // 01100101-00000100-10001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfmaxnm z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65048db7 <unknown>
movprfx z23, z31
@@ -25,30 +25,30 @@ bfmaxnm z23.h, p3/m, z23.h, z13.h // 01100101-00000100-10001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmaxnm z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65048db7 <unknown>
bfmaxnm z0.h, p0/m, z0.h, z0.h // 01100101-00000100-10000000-00000000
// CHECK-INST: bfmaxnm z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x80,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65048000 <unknown>
bfmaxnm z21.h, p5/m, z21.h, z10.h // 01100101-00000100-10010101-01010101
// CHECK-INST: bfmaxnm z21.h, p5/m, z21.h, z10.h
// CHECK-ENCODING: [0x55,0x95,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65049555 <unknown>
bfmaxnm z23.h, p3/m, z23.h, z13.h // 01100101-00000100-10001101-10110111
// CHECK-INST: bfmaxnm z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65048db7 <unknown>
bfmaxnm z31.h, p7/m, z31.h, z31.h // 01100101-00000100-10011111-11111111
// CHECK-INST: bfmaxnm z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x9f,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65049fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmin.s b/llvm/test/MC/AArch64/SVE2p1/bfmin.s
index 1bb3a0e6f1f26399..17bf50913271c567 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmin.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmin.s
@@ -17,7 +17,7 @@ bfmin z23.h, p3/m, z23.h, z13.h // 01100101-00000111-10001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfmin z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65078db7 <unknown>
movprfx z23, z31
@@ -25,30 +25,30 @@ bfmin z23.h, p3/m, z23.h, z13.h // 01100101-00000111-10001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmin z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65078db7 <unknown>
bfmin z0.h, p0/m, z0.h, z0.h // 01100101-00000111-10000000-00000000
// CHECK-INST: bfmin z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x80,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65078000 <unknown>
bfmin z21.h, p5/m, z21.h, z10.h // 01100101-00000111-10010101-01010101
// CHECK-INST: bfmin z21.h, p5/m, z21.h, z10.h
// CHECK-ENCODING: [0x55,0x95,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65079555 <unknown>
bfmin z23.h, p3/m, z23.h, z13.h // 01100101-00000111-10001101-10110111
// CHECK-INST: bfmin z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65078db7 <unknown>
bfmin z31.h, p7/m, z31.h, z31.h // 01100101-00000111-10011111-11111111
// CHECK-INST: bfmin z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x9f,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65079fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfminnm.s b/llvm/test/MC/AArch64/SVE2p1/bfminnm.s
index 9f444c7ac26ae071..e0cd2adc675eea2f 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfminnm.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfminnm.s
@@ -17,7 +17,7 @@ bfminnm z23.h, p3/m, z23.h, z13.h // 01100101-00000101-10001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfminnm z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65058db7 <unknown>
movprfx z23, z31
@@ -25,30 +25,30 @@ bfminnm z23.h, p3/m, z23.h, z13.h // 01100101-00000101-10001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfminnm z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65058db7 <unknown>
bfminnm z0.h, p0/m, z0.h, z0.h // 01100101-00000101-10000000-00000000
// CHECK-INST: bfminnm z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x80,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65058000 <unknown>
bfminnm z21.h, p5/m, z21.h, z10.h // 01100101-00000101-10010101-01010101
// CHECK-INST: bfminnm z21.h, p5/m, z21.h, z10.h
// CHECK-ENCODING: [0x55,0x95,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65059555 <unknown>
bfminnm z23.h, p3/m, z23.h, z13.h // 01100101-00000101-10001101-10110111
// CHECK-INST: bfminnm z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65058db7 <unknown>
bfminnm z31.h, p7/m, z31.h, z31.h // 01100101-00000101-10011111-11111111
// CHECK-INST: bfminnm z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x9f,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65059fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmla.s b/llvm/test/MC/AArch64/SVE2p1/bfmla.s
index ff257830a13da9b2..a265eb8b71df9298 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmla.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmla.s
@@ -17,31 +17,31 @@ bfmla z23.h, z13.h, z0.h[5] // 01100100-01101000-00001001-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmla z23.h, z13.h, z0.h[5]
// CHECK-ENCODING: [0xb7,0x09,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 646809b7 <unknown>
bfmla z0.h, z0.h, z0.h[0] // 01100100-00100000-00001000-00000000
// CHECK-INST: bfmla z0.h, z0.h, z0.h[0]
// CHECK-ENCODING: [0x00,0x08,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64200800 <unknown>
bfmla z21.h, z10.h, z5.h[6] // 01100100-01110101-00001001-01010101
// CHECK-INST: bfmla z21.h, z10.h, z5.h[6]
// CHECK-ENCODING: [0x55,0x09,0x75,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64750955 <unknown>
bfmla z23.h, z13.h, z0.h[5] // 01100100-01101000-00001001-10110111
// CHECK-INST: bfmla z23.h, z13.h, z0.h[5]
// CHECK-ENCODING: [0xb7,0x09,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 646809b7 <unknown>
bfmla z31.h, z31.h, z7.h[7] // 01100100-01111111-00001011-11111111
// CHECK-INST: bfmla z31.h, z31.h, z7.h[7]
// CHECK-ENCODING: [0xff,0x0b,0x7f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 647f0bff <unknown>
@@ -50,7 +50,7 @@ bfmla z23.h, p3/m, z13.h, z8.h // 01100101-00101000-00001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfmla z23.h, p3/m, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x0d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65280db7 <unknown>
movprfx z23, z31
@@ -58,30 +58,30 @@ bfmla z23.h, p3/m, z13.h, z8.h // 01100101-00101000-00001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmla z23.h, p3/m, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x0d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65280db7 <unknown>
bfmla z0.h, p0/m, z0.h, z0.h // 01100101-00100000-00000000-00000000
// CHECK-INST: bfmla z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x00,0x20,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65200000 <unknown>
bfmla z21.h, p5/m, z10.h, z21.h // 01100101-00110101-00010101-01010101
// CHECK-INST: bfmla z21.h, p5/m, z10.h, z21.h
// CHECK-ENCODING: [0x55,0x15,0x35,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65351555 <unknown>
bfmla z23.h, p3/m, z13.h, z8.h // 01100101-00101000-00001101-10110111
// CHECK-INST: bfmla z23.h, p3/m, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x0d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65280db7 <unknown>
bfmla z31.h, p7/m, z31.h, z31.h // 01100101-00111111-00011111-11111111
// CHECK-INST: bfmla z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x1f,0x3f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 653f1fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmls.s b/llvm/test/MC/AArch64/SVE2p1/bfmls.s
index c153b56b9586b191..56713e74adf8f033 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmls.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmls.s
@@ -17,31 +17,31 @@ bfmls z23.h, z13.h, z0.h[5] // 01100100-01101000-00001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmls z23.h, z13.h, z0.h[5]
// CHECK-ENCODING: [0xb7,0x0d,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64680db7 <unknown>
bfmls z0.h, z0.h, z0.h[0] // 01100100-00100000-00001100-00000000
// CHECK-INST: bfmls z0.h, z0.h, z0.h[0]
// CHECK-ENCODING: [0x00,0x0c,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64200c00 <unknown>
bfmls z21.h, z10.h, z5.h[6] // 01100100-01110101-00001101-01010101
// CHECK-INST: bfmls z21.h, z10.h, z5.h[6]
// CHECK-ENCODING: [0x55,0x0d,0x75,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64750d55 <unknown>
bfmls z23.h, z13.h, z0.h[5] // 01100100-01101000-00001101-10110111
// CHECK-INST: bfmls z23.h, z13.h, z0.h[5]
// CHECK-ENCODING: [0xb7,0x0d,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64680db7 <unknown>
bfmls z31.h, z31.h, z7.h[7] // 01100100-01111111-00001111-11111111
// CHECK-INST: bfmls z31.h, z31.h, z7.h[7]
// CHECK-ENCODING: [0xff,0x0f,0x7f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 647f0fff <unknown>
@@ -50,7 +50,7 @@ bfmls z23.h, p3/m, z13.h, z8.h // 01100101-00101000-00101101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfmls z23.h, p3/m, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x2d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65282db7 <unknown>
movprfx z23, z31
@@ -58,30 +58,30 @@ bfmls z23.h, p3/m, z13.h, z8.h // 01100101-00101000-00101101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmls z23.h, p3/m, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x2d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65282db7 <unknown>
bfmls z0.h, p0/m, z0.h, z0.h // 01100101-00100000-00100000-00000000
// CHECK-INST: bfmls z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x20,0x20,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65202000 <unknown>
bfmls z21.h, p5/m, z10.h, z21.h // 01100101-00110101-00110101-01010101
// CHECK-INST: bfmls z21.h, p5/m, z10.h, z21.h
// CHECK-ENCODING: [0x55,0x35,0x35,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65353555 <unknown>
bfmls z23.h, p3/m, z13.h, z8.h // 01100101-00101000-00101101-10110111
// CHECK-INST: bfmls z23.h, p3/m, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x2d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65282db7 <unknown>
bfmls z31.h, p7/m, z31.h, z31.h // 01100101-00111111-00111111-11111111
// CHECK-INST: bfmls z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x3f,0x3f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 653f3fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmul.s b/llvm/test/MC/AArch64/SVE2p1/bfmul.s
index e0b93bcbb1035042..62e7d892468b6f48 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmul.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmul.s
@@ -14,25 +14,25 @@
bfmul z0.h, z0.h, z0.h[0] // 01100100-00100000-00101000-00000000
// CHECK-INST: bfmul z0.h, z0.h, z0.h[0]
// CHECK-ENCODING: [0x00,0x28,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64202800 <unknown>
bfmul z21.h, z10.h, z5.h[6] // 01100100-01110101-00101001-01010101
// CHECK-INST: bfmul z21.h, z10.h, z5.h[6]
// CHECK-ENCODING: [0x55,0x29,0x75,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 64752955 <unknown>
bfmul z23.h, z13.h, z0.h[5] // 01100100-01101000-00101001-10110111
// CHECK-INST: bfmul z23.h, z13.h, z0.h[5]
// CHECK-ENCODING: [0xb7,0x29,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 646829b7 <unknown>
bfmul z31.h, z31.h, z7.h[7] // 01100100-01111111-00101011-11111111
// CHECK-INST: bfmul z31.h, z31.h, z7.h[7]
// CHECK-ENCODING: [0xff,0x2b,0x7f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 647f2bff <unknown>
movprfx z23.h, p3/m, z31.h
@@ -40,7 +40,7 @@ bfmul z23.h, p3/m, z23.h, z13.h // 01100101-00000010-10001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfmul z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65028db7 <unknown>
movprfx z23, z31
@@ -48,54 +48,54 @@ bfmul z23.h, p3/m, z23.h, z13.h // 01100101-00000010-10001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfmul z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65028db7 <unknown>
bfmul z0.h, p0/m, z0.h, z0.h // 01100101-00000010-10000000-00000000
// CHECK-INST: bfmul z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x80,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65028000 <unknown>
bfmul z21.h, p5/m, z21.h, z10.h // 01100101-00000010-10010101-01010101
// CHECK-INST: bfmul z21.h, p5/m, z21.h, z10.h
// CHECK-ENCODING: [0x55,0x95,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65029555 <unknown>
bfmul z23.h, p3/m, z23.h, z13.h // 01100101-00000010-10001101-10110111
// CHECK-INST: bfmul z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65028db7 <unknown>
bfmul z31.h, p7/m, z31.h, z31.h // 01100101-00000010-10011111-11111111
// CHECK-INST: bfmul z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x9f,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65029fff <unknown>
bfmul z0.h, z0.h, z0.h // 01100101-00000000-00001000-00000000
// CHECK-INST: bfmul z0.h, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x08,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65000800 <unknown>
bfmul z21.h, z10.h, z21.h // 01100101-00010101-00001001-01010101
// CHECK-INST: bfmul z21.h, z10.h, z21.h
// CHECK-ENCODING: [0x55,0x09,0x15,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65150955 <unknown>
bfmul z23.h, z13.h, z8.h // 01100101-00001000-00001001-10110111
// CHECK-INST: bfmul z23.h, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x09,0x08,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 650809b7 <unknown>
bfmul z31.h, z31.h, z31.h // 01100101-00011111-00001011-11111111
// CHECK-INST: bfmul z31.h, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x0b,0x1f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 651f0bff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfsub.s b/llvm/test/MC/AArch64/SVE2p1/bfsub.s
index 42cb6772c3a517bc..66590a72ed6b4c85 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfsub.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfsub.s
@@ -16,7 +16,7 @@ bfsub z23.h, p3/m, z23.h, z13.h // 01100101-00000001-10001101-10110111
// CHECK-INST: movprfx z23.h, p3/m, z31.h
// CHECK-INST: bfsub z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65018db7 <unknown>
movprfx z23, z31
@@ -24,53 +24,53 @@ bfsub z23.h, p3/m, z23.h, z13.h // 01100101-00000001-10001101-10110111
// CHECK-INST: movprfx z23, z31
// CHECK-INST: bfsub z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65018db7 <unknown>
bfsub z0.h, p0/m, z0.h, z0.h // 01100101-00000001-10000000-00000000
// CHECK-INST: bfsub z0.h, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x80,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65018000 <unknown>
bfsub z21.h, p5/m, z21.h, z10.h // 01100101-00000001-10010101-01010101
// CHECK-INST: bfsub z21.h, p5/m, z21.h, z10.h
// CHECK-ENCODING: [0x55,0x95,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65019555 <unknown>
bfsub z23.h, p3/m, z23.h, z13.h // 01100101-00000001-10001101-10110111
// CHECK-INST: bfsub z23.h, p3/m, z23.h, z13.h
// CHECK-ENCODING: [0xb7,0x8d,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65018db7 <unknown>
bfsub z31.h, p7/m, z31.h, z31.h // 01100101-00000001-10011111-11111111
// CHECK-INST: bfsub z31.h, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x9f,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65019fff <unknown>
bfsub z0.h, z0.h, z0.h // 01100101-00000000-00000100-00000000
// CHECK-INST: bfsub z0.h, z0.h, z0.h
// CHECK-ENCODING: [0x00,0x04,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65000400 <unknown>
bfsub z21.h, z10.h, z21.h // 01100101-00010101-00000101-01010101
// CHECK-INST: bfsub z21.h, z10.h, z21.h
// CHECK-ENCODING: [0x55,0x05,0x15,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 65150555 <unknown>
bfsub z23.h, z13.h, z8.h // 01100101-00001000-00000101-10110111
// CHECK-INST: bfsub z23.h, z13.h, z8.h
// CHECK-ENCODING: [0xb7,0x05,0x08,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 650805b7 <unknown>
bfsub z31.h, z31.h, z31.h // 01100101-00011111-00000111-11111111
// CHECK-INST: bfsub z31.h, z31.h, z31.h
// CHECK-ENCODING: [0xff,0x07,0x1f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
// CHECK-UNKNOWN: 651f07ff <unknown>
>From b68b62120eb9d7b3be67a013be500795a81e612e Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Wed, 22 Nov 2023 10:03:50 +0000
Subject: [PATCH 2/2] [Clang][AArch64] Add fixed vector types header to SVE
This patch is needed for the reduction instructions in SVE2.1.
It adds a new header to SVE with all the fixed vector types.
The new types are only added if NEON is not declared.
---
clang/include/clang/Basic/arm_vector_type.td | 13 ++
clang/lib/Headers/CMakeLists.txt | 3 +
.../CodeGen/arm-vector_type-params-returns.c | 113 ++++++++++++++++++
clang/utils/TableGen/NeonEmitter.cpp | 44 +++++++
clang/utils/TableGen/SveEmitter.cpp | 2 +
clang/utils/TableGen/TableGen.cpp | 15 ++-
clang/utils/TableGen/TableGenBackends.h | 1 +
7 files changed, 188 insertions(+), 3 deletions(-)
create mode 100644 clang/include/clang/Basic/arm_vector_type.td
create mode 100644 clang/test/CodeGen/arm-vector_type-params-returns.c
diff --git a/clang/include/clang/Basic/arm_vector_type.td b/clang/include/clang/Basic/arm_vector_type.td
new file mode 100644
index 0000000000000000..5018b0cdfc137850
--- /dev/null
+++ b/clang/include/clang/Basic/arm_vector_type.td
@@ -0,0 +1,13 @@
+//===--- arm_vector_type.td - ARM Fixed vector types compiler interface ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TableGen definitions from which the ARM fixed vector
+// type header file (arm_vector_type.h) will be generated.
+//
+//===----------------------------------------------------------------------===//
+include "arm_neon_incl.td"
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 8b1e2bc4afa4dcd0..0beb6ade42920455 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -385,6 +385,8 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD)
clang_generate_header(-gen-arm-mve-header arm_mve.td arm_mve.h)
# Generate arm_cde.h
clang_generate_header(-gen-arm-cde-header arm_cde.td arm_cde.h)
+ # Generate arm_vector_type.h
+ clang_generate_header(-gen-arm-vector-type arm_vector_type.td arm_vector_type.h)
# Add headers to target specific lists
list(APPEND arm_common_generated_files
@@ -401,6 +403,7 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD)
"${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h"
"${CMAKE_CURRENT_BINARY_DIR}/arm_sme_draft_spec_subject_to_change.h"
"${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h"
+ "${CMAKE_CURRENT_BINARY_DIR}/arm_vector_type.h"
)
endif()
if(RISCV IN_LIST LLVM_TARGETS_TO_BUILD)
diff --git a/clang/test/CodeGen/arm-vector_type-params-returns.c b/clang/test/CodeGen/arm-vector_type-params-returns.c
new file mode 100644
index 0000000000000000..48c19d01b6257cc5
--- /dev/null
+++ b/clang/test/CodeGen/arm-vector_type-params-returns.c
@@ -0,0 +1,113 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+#include <arm_sve.h>
+
+// function return types
+// CHECK-LABEL: define dso_local <8 x half> @test_ret_v8f16(
+// CHECK-SAME: <8 x half> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <8 x half> [[V]]
+//
+float16x8_t test_ret_v8f16(float16x8_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_ret_v4f32(
+// CHECK-SAME: <4 x float> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <4 x float> [[V]]
+//
+float32x4_t test_ret_v4f32(float32x4_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <2 x double> @test_ret_v2f64(
+// CHECK-SAME: <2 x double> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <2 x double> [[V]]
+//
+float64x2_t test_ret_v2f64(float64x2_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_ret_v8bf16(
+// CHECK-SAME: <8 x bfloat> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <8 x bfloat> [[V]]
+//
+bfloat16x8_t test_ret_v8bf16(bfloat16x8_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_ret_v16s8(
+// CHECK-SAME: <16 x i8> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <16 x i8> [[V]]
+//
+int8x16_t test_ret_v16s8(int8x16_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_ret_v8s16(
+// CHECK-SAME: <8 x i16> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <8 x i16> [[V]]
+//
+int16x8_t test_ret_v8s16(int16x8_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <4 x i32> @test_ret_v32s4(
+// CHECK-SAME: <4 x i32> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <4 x i32> [[V]]
+//
+int32x4_t test_ret_v32s4(int32x4_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <2 x i64> @test_ret_v64s2(
+// CHECK-SAME: <2 x i64> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <2 x i64> [[V]]
+//
+int64x2_t test_ret_v64s2(int64x2_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_ret_v16u8(
+// CHECK-SAME: <16 x i8> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <16 x i8> [[V]]
+//
+uint8x16_t test_ret_v16u8(uint8x16_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_ret_v8u16(
+// CHECK-SAME: <8 x i16> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <8 x i16> [[V]]
+//
+uint16x8_t test_ret_v8u16(uint16x8_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <4 x i32> @test_ret_v32u4(
+// CHECK-SAME: <4 x i32> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <4 x i32> [[V]]
+//
+uint32x4_t test_ret_v32u4(uint32x4_t v) {
+ return v;
+}
+
+// CHECK-LABEL: define dso_local <2 x i64> @test_ret_v64u2(
+// CHECK-SAME: <2 x i64> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret <2 x i64> [[V]]
+//
+uint64x2_t test_ret_v64u2(uint64x2_t v) {
+ return v;
+}
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 4b112972a1ec9819..2aefb56b08eb379a 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -593,6 +593,8 @@ class NeonEmitter {
// Emit arm_bf16.h.inc
void runBF16(raw_ostream &o);
+ void runVectorType(raw_ostream &o);
+
// Emit all the __builtin prototypes used in arm_neon.h, arm_fp16.h and
// arm_bf16.h
void runHeader(raw_ostream &o);
@@ -2546,6 +2548,44 @@ void NeonEmitter::runFP16(raw_ostream &OS) {
OS << "#endif /* __ARM_FP16_H */\n";
}
+void NeonEmitter::runVectorType(raw_ostream &OS) {
+ OS << "/*===---- arm_vector_type - ARM vector type "
+ "------===\n"
+ " *\n"
+ " *\n"
+ " * Part of the LLVM Project, under the Apache License v2.0 with LLVM "
+ "Exceptions.\n"
+ " * See https://llvm.org/LICENSE.txt for license information.\n"
+ " * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception\n"
+ " *\n"
+ " *===-----------------------------------------------------------------"
+ "------===\n"
+ " */\n\n";
+ OS << "#ifndef __ARM_NEON_TYPES_H\n";
+ OS << "#define __ARM_NEON_TYPES_H\n";
+ OS << "#ifdef __cplusplus\n";
+ OS << "extern \"C\" {\n";
+ OS << "#endif\n";
+ OS << "#ifndef __ARM_NEON_H\n";
+
+ std::string TypedefTypes("QcQsQiQlQUcQUsQUiQUlQhQfQdQb");
+ std::vector<TypeSpec> TDTypeVec = TypeSpec::fromTypeSpecs(TypedefTypes);
+ for (auto &TS : TDTypeVec) {
+ Type T(TS, ".");
+ OS << "typedef __attribute__((vector_size(16))) ";
+
+ Type T2 = T;
+ T2.makeScalar();
+ OS << T2.str();
+ OS << " " << T.str() << ";\n";
+ }
+ OS << "#endif\n";
+ OS << "#ifdef __cplusplus\n";
+ OS << "} // extern \"C\"\n";
+ OS << "#endif\n";
+ OS << "#endif //__ARM_NEON_TYPES_H\n";
+}
+
void NeonEmitter::runBF16(raw_ostream &OS) {
OS << "/*===---- arm_bf16.h - ARM BF16 intrinsics "
"-----------------------------------===\n"
@@ -2640,6 +2680,10 @@ void clang::EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) {
NeonEmitter(Records).runHeader(OS);
}
+void clang::EmitVectorType(RecordKeeper &Records, raw_ostream &OS) {
+ NeonEmitter(Records).runVectorType(OS);
+}
+
void clang::EmitNeonTest(RecordKeeper &Records, raw_ostream &OS) {
llvm_unreachable("Neon test generation no longer implemented!");
}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index d00989ac0f3beb55..3274a25c769bd349 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1280,6 +1280,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
OS << "typedef __SVBfloat16_t svbfloat16_t;\n";
OS << "#include <arm_bf16.h>\n";
+ OS << "#include <arm_vector_type.h>\n";
OS << "typedef __SVFloat32_t svfloat32_t;\n";
OS << "typedef __SVFloat64_t svfloat64_t;\n";
@@ -1724,4 +1725,5 @@ void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) {
void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) {
SVEEmitter(Records).createSMERangeChecks(OS);
}
+
} // End namespace clang
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
index 7efb6c731d3e5ee2..66008ae0c2e3c14b 100644
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -73,6 +73,7 @@ enum ActionType {
GenArmNeon,
GenArmFP16,
GenArmBF16,
+ GenArmVectorType,
GenArmNeonSema,
GenArmNeonTest,
GenArmMveHeader,
@@ -229,6 +230,8 @@ cl::opt<ActionType> Action(
clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"),
clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"),
clEnumValN(GenArmBF16, "gen-arm-bf16", "Generate arm_bf16.h for clang"),
+ clEnumValN(GenArmVectorType, "gen-arm-vector-type",
+ "Generate arm_vector_type.h for clang"),
clEnumValN(GenArmNeonSema, "gen-arm-neon-sema",
"Generate ARM NEON sema support for clang"),
clEnumValN(GenArmNeonTest, "gen-arm-neon-test",
@@ -279,11 +282,14 @@ cl::opt<ActionType> Action(
"Generate riscv_vector_builtin_cg.inc for clang"),
clEnumValN(GenRISCVVectorBuiltinSema, "gen-riscv-vector-builtin-sema",
"Generate riscv_vector_builtin_sema.inc for clang"),
- clEnumValN(GenRISCVSiFiveVectorBuiltins, "gen-riscv-sifive-vector-builtins",
+ clEnumValN(GenRISCVSiFiveVectorBuiltins,
+ "gen-riscv-sifive-vector-builtins",
"Generate riscv_sifive_vector_builtins.inc for clang"),
- clEnumValN(GenRISCVSiFiveVectorBuiltinCG, "gen-riscv-sifive-vector-builtin-codegen",
+ clEnumValN(GenRISCVSiFiveVectorBuiltinCG,
+ "gen-riscv-sifive-vector-builtin-codegen",
"Generate riscv_sifive_vector_builtin_cg.inc for clang"),
- clEnumValN(GenRISCVSiFiveVectorBuiltinSema, "gen-riscv-sifive-vector-builtin-sema",
+ clEnumValN(GenRISCVSiFiveVectorBuiltinSema,
+ "gen-riscv-sifive-vector-builtin-sema",
"Generate riscv_sifive_vector_builtin_sema.inc for clang"),
clEnumValN(GenAttrDocs, "gen-attr-docs",
"Generate attribute documentation"),
@@ -449,6 +455,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case GenArmFP16:
EmitFP16(Records, OS);
break;
+ case GenArmVectorType:
+ EmitVectorType(Records, OS);
+ break;
case GenArmBF16:
EmitBF16(Records, OS);
break;
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index d8f447069376bca3..b5bf0b56043a8b71 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -97,6 +97,7 @@ void EmitNeon(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitFP16(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitBF16(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitNeonSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitVectorType(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitNeonTest(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitSveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
More information about the cfe-commits
mailing list