[clang] [SVE2.1][Clang][LLVM]Add BFloat16 builtin in Clang and LLVM intrinisc (PR #70362)

Thu Oct 26 11:09:56 PDT 2023

https://github.com/CarolineConcatto created https://github.com/llvm/llvm-project/pull/70362

This patch implements the builtins in Clang
and the LLVM-IR intrinsic for the following:

For BFADD , BFSUB, BFMAX, BFMIN, BFMAXNM and BFMINNM, for instance: 
svbfloat16_t svadd[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); 
svbfloat16_t svadd[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); 
svbfloat16_t svadd[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); 
svbfloat16_t svadd[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); 
svbfloat16_t svadd[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); 
svbfloat16_t svadd[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); 
the add, could be replaced by sub, max, min, maxnm and minnm.

For BFMUL:
svbfloat16_t svmul[_bf16]_m(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); 
svbfloat16_t svmul[_bf16]_x(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); 
svbfloat16_t svmul[_bf16]_z(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); 
svbfloat16_t svmul[_n_bf16]_m(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); 
svbfloat16_t svmul[_n_bf16]_x(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); 
svbfloat16_t svmul[_n_bf16]_z(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm);

svbfloat16_t svmul_lane[_bf16](svbfloat16_t zn, svbfloat16_t zm,
                               uint64_t imm_idx);

For BFCLAMP:
svbfloat16_t svclamp[_bf16](svbfloat16_t op, svbfloat16_t min, svbfloat16_t max);

For BFMLA and BFMLS
svbfloat16_t svmla[_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                            svbfloat16_t zm);
svbfloat16_t svmla[_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                            svbfloat16_t zm);
svbfloat16_t svmla[_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                            svbfloat16_t zm);
svbfloat16_t svmla[_n_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                              bfloat16_t zm);
svbfloat16_t svmla[_n_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                              bfloat16_t zm);
svbfloat16_t svmla[_n_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                              bfloat16_t zm);

svbfloat16_t svmla_lane[_bf16](svbfloat16_t zda, svbfloat16_t zn,
                               svbfloat16_t zm, uint64_t

According to the PR#257[1]
[1]ARM-software/acle#257

Co-author: Matthew Devereau <matthew.devereau at arm.com>

>From 22e54d4611092ddf092c365372fa982cddbcacb6 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Thu, 26 Oct 2023 17:48:58 +0000
Subject: [PATCH] [SVE2.1][Clang][LLVM]Add BFloat16 builtin in Clang and LLVM
 intrinisc

This patch implements the builtins in Clang
and the LLVM-IR intrinsic for the following:

For BFADD , BFSUB, BFMAX, BFMIN, BFMAXNM and BFMINNM, for instance:
svbfloat16_t svadd[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm);
svbfloat16_t svadd[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm);
svbfloat16_t svadd[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm);
svbfloat16_t svadd[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm);
svbfloat16_t svadd[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm);
svbfloat16_t svadd[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm);
the add, could be replaced by sub, max, min, maxnm and minnm.

For BFMUL:
svbfloat16_t svmul[_bf16]_m(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm);
svbfloat16_t svmul[_bf16]_x(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm);
svbfloat16_t svmul[_bf16]_z(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm);
svbfloat16_t svmul[_n_bf16]_m(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm);
svbfloat16_t svmul[_n_bf16]_x(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm);
svbfloat16_t svmul[_n_bf16]_z(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm);

svbfloat16_t svmul_lane[_bf16](svbfloat16_t zn, svbfloat16_t zm,
                               uint64_t imm_idx);

For BFCLAMP:
svbfloat16_t svclamp[_bf16](svbfloat16_t op, svbfloat16_t min, svbfloat16_t max);

For BFMLA and BFMLS
svbfloat16_t svmla[_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                            svbfloat16_t zm);
svbfloat16_t svmla[_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                            svbfloat16_t zm);
svbfloat16_t svmla[_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                            svbfloat16_t zm);
svbfloat16_t svmla[_n_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                              bfloat16_t zm);
svbfloat16_t svmla[_n_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                              bfloat16_t zm);
svbfloat16_t svmla[_n_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn,
                              bfloat16_t zm);

svbfloat16_t svmla_lane[_bf16](svbfloat16_t zda, svbfloat16_t zn,
                               svbfloat16_t zm, uint64_t

According to the PR#257[1]
[1]ARM-software/acle#257

Co-author: Matthew Devereau <matthew.devereau at arm.com>
---
 clang/include/clang/Basic/arm_sve.td          |  16 +++
 .../acle_sve2p1_bfadd.c                       | 133 +++++++++++++++++
 .../acle_sve2p1_bfclamp.c                     |  31 ++++
 .../acle_sve2p1_bfmax.c                       | 134 ++++++++++++++++++
 .../acle_sve2p1_bfmaxnm.c                     | 134 ++++++++++++++++++
 .../acle_sve2p1_bfmin.c                       | 134 ++++++++++++++++++
 .../acle_sve2p1_bfminnm.c                     | 134 ++++++++++++++++++
 .../acle_sve2p1_bfmla.c                       | 133 +++++++++++++++++
 .../acle_sve2p1_bfmla_lane.c                  |  60 ++++++++
 .../acle_sve2p1_bfmls.c                       | 133 +++++++++++++++++
 .../acle_sve2p1_bfmls_lane.c                  |  60 ++++++++
 .../acle_sve2p1_bfmul.c                       | 134 ++++++++++++++++++
 .../acle_sve2p1_bfmul_lane.c                  |  61 ++++++++
 .../acle_sve2p1_bfsub.c                       | 134 ++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64.td            |   8 +-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  62 +++++---
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  58 +++++++-
 .../AArch64/sve2p1-intrinsics-bfadd.ll        |  62 ++++++++
 .../AArch64/sve2p1-intrinsics-bfclamp.ll      |  13 ++
 .../AArch64/sve2p1-intrinsics-bfmax.ll        |  74 ++++++++++
 .../AArch64/sve2p1-intrinsics-bfmaxnm.ll      |  74 ++++++++++
 .../AArch64/sve2p1-intrinsics-bfmin.ll        |  74 ++++++++++
 .../AArch64/sve2p1-intrinsics-bfminnm.ll      |  74 ++++++++++
 .../AArch64/sve2p1-intrinsics-bfmla.ll        |  35 +++++
 .../AArch64/sve2p1-intrinsics-bfmla_lane.ll   |  31 ++++
 .../AArch64/sve2p1-intrinsics-bfmls.ll        |  49 +++----
 .../AArch64/sve2p1-intrinsics-bfmls_lane.ll   |  31 ++++
 .../AArch64/sve2p1-intrinsics-bfmlsl.ll       |  43 ++++++
 .../AArch64/sve2p1-intrinsics-bfmul.ll        |  62 ++++++++
 .../AArch64/sve2p1-intrinsics-bfmul_lane.ll   |  37 +++++
 .../AArch64/sve2p1-intrinsics-bfsub.ll        |  62 ++++++++
 llvm/test/MC/AArch64/SVE2p1/bfadd.s           |  20 +--
 llvm/test/MC/AArch64/SVE2p1/bfclamp.s         |  10 +-
 llvm/test/MC/AArch64/SVE2p1/bfmax.s           |  12 +-
 llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s         |  12 +-
 llvm/test/MC/AArch64/SVE2p1/bfmin.s           |  12 +-
 llvm/test/MC/AArch64/SVE2p1/bfminnm.s         |  12 +-
 llvm/test/MC/AArch64/SVE2p1/bfmla.s           |  22 +--
 llvm/test/MC/AArch64/SVE2p1/bfmls.s           |  22 +--
 llvm/test/MC/AArch64/SVE2p1/bfmul.s           |  28 ++--
 llvm/test/MC/AArch64/SVE2p1/bfsub.s           |  20 +--
 41 files changed, 2314 insertions(+), 136 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
 create mode 100644 clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b5baafedd139602..b73e1a997bf964b 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1980,3 +1980,19 @@ def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sv
 
 defm SVREVD : SInstZPZ<"svrevd", "csilUcUsUiUl", "aarch64_sve_revd">;
 }
+
+let TargetGuard = "sve2p1,b16b16" in {
+defm SVMUL_BF  : SInstZPZZ<"svmul",  "b", "aarch64_sve_fmul",   "aarch64_sve_fmul_u">;
+defm SVADD_BF  : SInstZPZZ<"svadd",  "b", "aarch64_sve_fadd",   "aarch64_sve_fadd_u">;
+defm SVSUB_BF  : SInstZPZZ<"svsub",  "b", "aarch64_sve_fsub",   "aarch64_sve_fsub_u">;
+defm SVMAXNM_BF  : SInstZPZZ<"svmaxnm","b", "aarch64_sve_fmaxnm", "aarch64_sve_fmaxnm_u">;
+defm SVMINNM_BF  : SInstZPZZ<"svminnm","b", "aarch64_sve_fminnm", "aarch64_sve_fminnm_u">;
+defm SVMAX_BF    : SInstZPZZ<"svmax",  "b", "aarch64_sve_fmax",   "aarch64_sve_fmax_u">;
+defm SVMIN_BF    : SInstZPZZ<"svmin",  "b", "aarch64_sve_fmin",   "aarch64_sve_fmin_u">;
+defm SVMLA_BF  : SInstZPZZZ<"svmla",  "b", "aarch64_sve_fmla",  "aarch64_sve_fmla_u", []>;
+defm SVMLS_BF  : SInstZPZZZ<"svmls",  "b", "aarch64_sve_fmls",  "aarch64_sve_fmls_u", []>;
+def SVMLA_LANE_BF  : SInst<"svmla_lane[_{d}]",  "ddddi",  "b", MergeNone, "aarch64_sve_fmla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLS_LANE_BF  : SInst<"svmls_lane[_{d}]",  "ddddi",  "b", MergeNone, "aarch64_sve_fmls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMUL_LANE_BF  : SInst<"svmul_lane[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_fmul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def SVFCLAMP_BF   : SInst<"svclamp[_{d}]", "dddd", "b", MergeNone, "aarch64_sve_fclamp", [], []>;
+} //sve2p1,b16b16
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
new file mode 100644
index 000000000000000..63a5f09acb419eb
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
@@ -0,0 +1,133 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svadd_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svadd, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svadd_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svadd, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svadd_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svadd, _bf16, _x)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_n_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svadd, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_n_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svadd_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svadd, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svadd_bf16_n_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svadd_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svadd_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svadd, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
new file mode 100644
index 000000000000000..42f6cca9fe86542
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfclamp.c
@@ -0,0 +1,31 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svclamp_bf16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svclamp_bf16u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svclamp_bf16(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svclamp, _bf16,)(op1, op2, op3);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
new file mode 100644
index 000000000000000..bcd480540e36ed8
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmax_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmax, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmax_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmax, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmax_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmax, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmax_bf16_n_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmax, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_n_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmax_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmax, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmax_bf16_n_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmax_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmax_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmax, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
new file mode 100644
index 000000000000000..f52fd8606c837b6
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmaxnm_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmaxnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmaxnm, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmaxnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmaxnm, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmaxnm_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmaxnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmaxnm, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmaxnm_bf16_n_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmaxnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_n_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmaxnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmaxnm_bf16_n_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svmaxnm_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmaxnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
new file mode 100644
index 000000000000000..26e3ea7d38f2c2b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmin_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmin_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmin, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmin_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmin, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmin_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmin_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmin, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmin_bf16_n_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmin_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmin, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_n_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmin_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmin, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmin_bf16_n_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmin_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmin_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmin, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
new file mode 100644
index 000000000000000..b5d77bb0cb638e1
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svminnm_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svminnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svminnm, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svminnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svminnm, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svminnm_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svminnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svminnm, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svminnm_bf16_n_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svminnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svminnm, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_n_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svminnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svminnm, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svminnm_bf16_n_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svminnm_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svminnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svminnm, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
new file mode 100644
index 000000000000000..fe244a8dd881a01
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
@@ -0,0 +1,133 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmla_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla, _bf16, _m)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmla_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmla_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla, _bf16, _z)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmla_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmla_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla, _bf16, _x)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmla_n_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla, _n_bf16, _m)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmla_n_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmla_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla, _n_bf16, _z)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmla_n_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmla_n_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmla_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla, _n_bf16, _x)(pg, op1, op2, op3);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
new file mode 100644
index 000000000000000..9de32cdfc2e665e
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla_lane.c
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmla_lane_bf16_idx1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx1u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmla_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla_lane, _bf16,)(op1, op2, op3, 1);
+}
+
+// CHECK-LABEL: @test_svmla_lane_bf16_idx3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx3u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmla_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla_lane ,_bf16,)(op1, op2, op3, 3);
+}
+
+// CHECK-LABEL: @test_svmla_lane_bf16_idx7(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmla_lane_bf16_idx7u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmla_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmla_lane, _bf16,)(op1, op2, op3, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
new file mode 100644
index 000000000000000..3cb3d3d9df47a56
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
@@ -0,0 +1,133 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmls_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls, _bf16, _m)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmls_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmls_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls, _bf16, _z)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmls_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmls_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls, _bf16, _x)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmls_n_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls, _n_bf16, _m)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmls_n_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmls_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls, _n_bf16, _z)(pg, op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svmls_n_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmls_n_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP3:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmls_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls, _n_bf16, _x)(pg, op1, op2, op3);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
new file mode 100644
index 000000000000000..89fd47727157ba9
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls_lane.c
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmls_lane_bf16_idx1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx1u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 1)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmls_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 1);
+}
+
+// CHECK-LABEL: @test_svmls_lane_bf16_idx3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx3u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 3)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmls_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 3);
+}
+
+// CHECK-LABEL: @test_svmls_lane_bf16_idx7(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmls_lane_bf16_idx7u14__SVBFloat16_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]], i32 7)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmls_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3)
+{
+  return SVE_ACLE_FUNC(svmls_lane, _bf16,)(op1, op2, op3, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
new file mode 100644
index 000000000000000..5e3521500c0bb8e
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmul_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmul, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmul_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmul, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svmul_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmul, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svmul_bf16_n_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmul, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_n_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svmul_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmul, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svmul_bf16_n_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svmul_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svmul_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svmul, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
new file mode 100644
index 000000000000000..3464994bed85745
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul_lane.c
@@ -0,0 +1,61 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -target-feature +sve -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svmul_lane_bf16_idx1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 1)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx1u14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 1)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmul_lane_bf16_idx1(svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 1);
+}
+
+// CHECK-LABEL: @test_svmul_lane_bf16_idx3(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 3)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx3u14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 3)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmul_lane_bf16_idx3(svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 3);
+}
+
+// CHECK-LABEL: @test_svmul_lane_bf16_idx7(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 7)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svmul_lane_bf16_idx7u14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i32 7)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svmul_lane_bf16_idx7(svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svmul_lane, _bf16, )(op1, op2, 7);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
new file mode 100644
index 000000000000000..3675fd1d5ad7fe0
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svsub_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_mu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svsub, _bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_zu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svsub_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svsub, _bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z17test_svsub_bf16_xu10__SVBool_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svsub, _bf16, _x)(pg, op1, op2);
+}
+
+
+// CHECK-LABEL: @test_svsub_bf16_n_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_mu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+  return SVE_ACLE_FUNC(svsub, _n_bf16, _m)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_n_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_zu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svsub_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svsub, _n_bf16, _z)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svsub_bf16_n_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svsub_bf16_n_xu10__SVBool_tu14__SVBFloat16_tu6__bf16(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x bfloat> poison, bfloat [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x bfloat> [[DOTSPLATINSERT]], <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svsub_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
+{
+   return SVE_ACLE_FUNC(svsub, _n_bf16, _x)(pg, op1, op2);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index ced1d4389203653..1688b1fc801a2d4 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -151,6 +151,9 @@ def FeatureExperimentalZeroingPseudos
 def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
   "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
 
+def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
+    "true", "Enable BFloat16 Extension (FEAT_BF16)" >;
+
 def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
   "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">;
 
@@ -172,7 +175,7 @@ def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "tru
   "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2]>;
 
 def FeatureSVE2p1: SubtargetFeature<"sve2p1", "HasSVE2p1", "true",
-  "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>;
+  "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2, FeatureBF16]>;
 
 def FeatureB16B16 : SubtargetFeature<"b16b16", "HasB16B16", "true",
   "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", []>;
@@ -444,9 +447,6 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
     "true", "Use an instruction sequence for taking the address of a global "
     "that allows a memory tag in the upper address bits">;
 
-def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
-    "true", "Enable BFloat16 Extension (FEAT_BF16)" >;
-
 def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
     "true", "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d599ac4689e5cb3..c39dc8452bf0b3a 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3945,28 +3945,56 @@ def : InstAlias<"pfalse\t$Pd", (PFALSE PNRasPPR8:$Pd), 0>;
 // SVE2.1 non-widening BFloat16 to BFloat16 instructions
 //===----------------------------------------------------------------------===//
 
-let Predicates = [HasSVE2p1_or_HasSME2p1, HasB16B16] in {
-def BFADD_ZZZ : sve_fp_3op_u_zd<0b00, 0b000, "bfadd", ZPR16>;
-def BFSUB_ZZZ : sve_fp_3op_u_zd<0b00, 0b001, "bfsub", ZPR16>;
-def BFMUL_ZZZ : sve_fp_3op_u_zd<0b00, 0b010, "bfmul", ZPR16>;
+let Predicates = [HasSVE2p1, HasB16B16, UseExperimentalZeroingPseudos] in {
+defm BFADD_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fadd>;
+defm BFSUB_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fsub>;
+defm BFMUL_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmul>;
+defm BFMAXNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmaxnm>;
+defm BFMINNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fminnm>;
+defm BFMIN_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmin>;
+defm BFMAX_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmax>;
+} //HasSVE2p1_or_HasSME2p1, HasB16B16, UseExperimentalZeroingPseudos
 
-def BFMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b00, "bfmla", ZPR16>;
-def BFMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b00, 0b01, "bfmls", ZPR16>;
+let Predicates = [HasSVE2p1, HasB16B16] in {
 
-def BFADD_ZPZmZ   : sve_fp_2op_p_zds<0b00, 0b0000, "bfadd", ZPR16>;
-def BFSUB_ZPZmZ   : sve_fp_2op_p_zds<0b00, 0b0001, "bfsub", ZPR16>;
-def BFMUL_ZPZmZ   : sve_fp_2op_p_zds<0b00, 0b0010, "bfmul", ZPR16>;
-def BFMAXNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0100, "bfmaxnm", ZPR16>;
-def BFMINNM_ZPZmZ : sve_fp_2op_p_zds<0b00, 0b0101, "bfminnm", ZPR16>;
-def BFMAX_ZPZmZ   : sve_fp_2op_p_zds<0b00, 0b0110, "bfmax", ZPR16>;
-def BFMIN_ZPZmZ   : sve_fp_2op_p_zds<0b00, 0b0111, "bfmin", ZPR16>;
+defm BFMLA_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b00, "bfmla", "BFMLA_ZPZZZ", AArch64fmla_m1>;
+defm BFMLS_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b01, "bfmls", "BFMLS_ZPZZZ", AArch64fmls_m1>;
 
-defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10>;
-defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11>;
+defm BFMLA_ZPZZZ : sve_fp_3op_pred_bf<AArch64fmla_p>;
+defm BFMLS_ZPZZZ : sve_fp_3op_pred_bf<AArch64fmls_p>;
 
-defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul">;
+defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10, int_aarch64_sve_fmla_lane>;
+defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11, int_aarch64_sve_fmls_lane>;
 
-def BFCLAMP_ZZZ : sve2p1_fclamp<"bfclamp", 0b00, ZPR16>;
+defm BFADD_ZPmZZ : sve2p1_bf_2op_p_zds<0b0000, "bfadd", "BFADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>;
+defm BFSUB_ZPmZZ : sve2p1_bf_2op_p_zds<0b0001, "bfsub", "BFSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryComm>;
+defm BFMUL_ZPmZZ : sve2p1_bf_2op_p_zds<0b0010, "bfmul", "BFMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>;
+
+defm BFADD_ZZZ : sve2p1_bf_3op_u_zd<0b000, "bfadd", fadd, AArch64fadd_p>;
+defm BFSUB_ZZZ : sve2p1_bf_3op_u_zd<0b001, "bfsub", fsub, AArch64fsub_p>;
+defm BFMUL_ZZZ : sve2p1_bf_3op_u_zd<0b010, "bfmul", fmul, AArch64fmul_p>;
+
+defm BFADD_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fadd_p>;
+defm BFSUB_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fsub_p>;
+defm BFMUL_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmul_p>;
+
+
+defm BFMAX_ZPmZZ : sve2p1_bf_2op_p_zds<0b0110, "bfmax", "BFMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
+defm BFMIN_ZPmZZ : sve2p1_bf_2op_p_zds<0b0111, "bfmin", "BFMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
+
+defm BFMAX_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmax_p>;
+defm BFMIN_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmin_p>;
+
+
+defm BFMAXNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0100, "bfmaxnm", "BFMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
+defm BFMINNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0101, "bfminnm", "BFMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
+
+defm BFMAXNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmaxnm_p>;
+defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;
+
+defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;
+
+defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", int_aarch64_sve_fclamp>;
 } // End HasSVE2p1_or_HasSME2p1, HasB16B16
 
 
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 7bb457d9188210c..0c985592c5949d4 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2118,6 +2118,29 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
   let mayRaiseFPException = 1;
 }
 
+multiclass sve2p1_bf_2op_p_zds<bits<4> opc, string asm, string Ps,
+                            SDPatternOperator op, DestructiveInstTypeEnum flags,
+                            string revname="", bit isReverseInstr=0> {
+let DestructiveInstType = flags in {
+  def NAME : sve_fp_2op_p_zds<0b00, opc, asm, ZPR16>,
+           SVEPseudo2Instr<Ps, 1>, SVEInstr2Rev<NAME , revname , isReverseInstr>;
+  }
+
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+multiclass sve2p1_bf_bin_pred_zds<SDPatternOperator op> {
+  def _UNDEF : PredTwoOpPseudo<NAME, ZPR16, FalseLanesUndef>;
+
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1,  nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+}
+
+multiclass sve2p1_bf_2op_p_zds_zeroing<SDPatternOperator op> {
+  def _ZERO : PredTwoOpPseudo<NAME, ZPR16, FalseLanesZero>;
+
+  def : SVE_3_Op_Pat_SelZero<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _ZERO)>;
+}
+
 multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
                             SDPatternOperator op, DestructiveInstTypeEnum flags,
                             string revname="", bit isReverseInstr=0> {
@@ -2266,6 +2289,14 @@ multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op,
   def : SVE_2_Op_Pred_All_Active<nxv2f64, predicated_op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
+multiclass sve2p1_bf_3op_u_zd<bits<3> opc1, string asm, SDPatternOperator op,
+                          SDPatternOperator predicated_op = null_frag> {
+  def NAME : sve_fp_3op_u_zd<0b00, opc1, asm, ZPR16>;
+  def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+
+  def : SVE_2_Op_Pred_All_Active<nxv8bf16, predicated_op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
 multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
   def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
   def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
@@ -2324,6 +2355,14 @@ multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, string Ps,
   def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
+multiclass sve_fp_3op_p_zds_a_bf<bits<2> opc, string asm, string Ps,
+                              SDPatternOperator op> {
+  def NAME : sve_fp_3op_p_zds_a<0b00, opc, asm, ZPR16>,
+           SVEPseudo2Instr<Ps, 1>, SVEInstr2Rev<NAME, "", 0>;
+
+  def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
 class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
                          ZPRRegOp zprty>
 : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
@@ -2391,7 +2430,7 @@ class sve_fp_fma_by_indexed_elem<bits<2> sz, bits<2> opc, string asm,
   let mayRaiseFPException = 1;
 }
 
-multiclass sve2p1_fp_bfma_by_indexed_elem<string asm, bits<2> opc> {
+multiclass sve2p1_fp_bfma_by_indexed_elem<string asm, bits<2> opc, SDPatternOperator op> {
   def NAME : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16,
                                          VectorIndexH32b> {
     bits<3> Zm;
@@ -2400,6 +2439,8 @@ multiclass sve2p1_fp_bfma_by_indexed_elem<string asm, bits<2> opc> {
     let Inst{20-19} = iop{1-0};
     let Inst{18-16} = Zm;
   }
+  def : Pat<(nxv8bf16 (op nxv8bf16:$op1, nxv8bf16:$op2, nxv8bf16:$op3, (i32 VectorIndexH32b_timm:$idx))),
+            (!cast<Instruction>(NAME) $op1, $op2, $op3, VectorIndexH32b_timm:$idx)>;
 }
 
 multiclass sve_fp_fma_by_indexed_elem<bits<2> opc, string asm,
@@ -2456,7 +2497,7 @@ class sve_fp_fmul_by_indexed_elem<bits<2> sz, bit o2, string asm, ZPRRegOp zprty
   let mayRaiseFPException = 1;
 }
 
-multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm> {
+multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm, SDPatternOperator ir_intrinsic> {
   def NAME : sve_fp_fmul_by_indexed_elem<{0, ?}, 0b1, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
     bits<3> Zm;
     bits<3> iop;
@@ -2464,6 +2505,8 @@ multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm> {
     let Inst{20-19} = iop{1-0};
     let Inst{18-16} = Zm;
   }
+  def : Pat <(nxv8bf16 (ir_intrinsic nxv8bf16:$Op1, nxv8bf16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
+             (!cast<Instruction>(NAME) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
 }
 
 multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
@@ -9100,6 +9143,12 @@ multiclass sve_fp_3op_pred_hfd<SDPatternOperator op> {
   def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D_UNDEF)>;
 }
 
+multiclass sve_fp_3op_pred_bf<SDPatternOperator op> {
+  def _UNDEF : PredThreeOpPseudo<NAME, ZPR16, FalseLanesUndef>;
+
+  def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _UNDEF)>;
+}
+
 // Predicated pseudo integer two operand instructions.
 multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> {
   def _B_UNDEF : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
@@ -9185,6 +9234,11 @@ multiclass sve2p1_fclamp<string asm, SDPatternOperator op> {
   def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
+multiclass sve2p1_bfclamp<string asm, SDPatternOperator op> {
+  def NAME : sve2p1_fclamp<asm, 0b00, ZPR16>;
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
 // SVE two-way dot product
 class sve2p1_two_way_dot_vv<string mnemonic, bit u>
     : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
new file mode 100644
index 000000000000000..221bb3b6045fb29
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfadd_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfadd_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfadd_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfadd_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfadd_u(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfadd_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfadd_u_ptrue(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfadd_u_ptrue:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfadd z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfadd_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfadd_u_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
new file mode 100644
index 000000000000000..61b67755a35441e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfclamp:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
new file mode 100644
index 000000000000000..24c4fedb3426628
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmax_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmax_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmax_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax_u_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmax_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmax_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmax_u_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
new file mode 100644
index 000000000000000..25fe9cf7243a4e4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmaxnm_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmaxnm_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm_u_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmaxnm_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmaxnm_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmaxnm_u_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
new file mode 100644
index 000000000000000..d5b0b8be8b85ea9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmin_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmin_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmin_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin_u_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmin_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmin_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmin_u_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
new file mode 100644
index 000000000000000..c019dc7cbe29104
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfminnm_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfminnm_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfminnm_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm_u_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfminnm_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfminnm_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfminnm_u_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
new file mode 100644
index 000000000000000..02b1db13ea34f7c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmla_m(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_m:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_x(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_x:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_z(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_z:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z3.h
+; CHECK-NEXT:    bfmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a_z, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll
new file mode 100644
index 000000000000000..d0e3a82df3ff919
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmla_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_lane_idx1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmla z0.h, z1.h, z2.h[1]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 1)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_lane_idx3(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_lane_idx3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmla z0.h, z1.h, z2.h[3]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 3)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmla_lane_idx7(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmla_lane_idx7:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmla z0.h, z1.h, z2.h[7]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 7)
+  ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.lane.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
index 04ad000f20f070a..987fe1fb5822aa4 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
@@ -1,43 +1,36 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
 
-define <vscale x 4 x float> @bfmlslb_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslb_f32:
+define <vscale x 8 x bfloat> @bfmls_m(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_m:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmlslb z0.s, z1.h, z2.h
+; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
-  ret <vscale x 4 x float> %out
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
 }
 
-define <vscale x 4 x float> @bfmlslt_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslt_f32:
+define <vscale x 8 x bfloat> @bfmls_x(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_x:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmlslt z0.s, z1.h, z2.h
+; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
-  ret <vscale x 4 x float> %out
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
 }
 
-define <vscale x 4 x float> @bfmlslb_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslb_lane_f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmlslb z0.s, z1.h, z2.h[7]
-; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
-  ret <vscale x 4 x float> %out
-}
 
-define <vscale x 4 x float> @bfmlslt_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
-; CHECK-LABEL: bfmlslt_lane_f32:
+define <vscale x 8 x bfloat> @bfmls_z(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_z:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bfmlslt z0.s, z1.h, z2.h[7]
+; CHECK-NEXT:    mov z3.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z3.h
+; CHECK-NEXT:    bfmls z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
-  ret <vscale x 4 x float> %out
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a_z, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
 }
 
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll
new file mode 100644
index 000000000000000..16b4538ffab9e2b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmls_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_lane_idx1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmls z0.h, z1.h, z2.h[1]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 1)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmls_lane_idx3(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_lane_idx3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmls z0.h, z1.h, z2.h[3]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 3)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmls_lane_idx7(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
+; CHECK-LABEL: bfmls_lane_idx7:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmls z0.h, z1.h, z2.h[7]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 7)
+  ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.lane.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll
new file mode 100644
index 000000000000000..2b96d452fba0d50
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmlsl.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 4 x float> @bfmlslb_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslb_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslb z0.s, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslt_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslt_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslt z0.s, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslb_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslb_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslb z0.s, z1.h, z2.h[7]
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslt_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslt_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslt z0.s, z1.h, z2.h[7]
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+  ret <vscale x 4 x float> %out
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
new file mode 100644
index 000000000000000..a04c5a52139cdfa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmul_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmul_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmul_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmul_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmul_u_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmul_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfmul_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmul z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfmul_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_u_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    bfmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll
new file mode 100644
index 000000000000000..2962d59e707ca20
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfmul_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_lane_idx1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmul z0.h, z0.h, z1.h[1]
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> %a,
+                                                                          <vscale x 8 x bfloat> %b,
+                                                                          i32 1)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmul_lane_idx3(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_lane_idx3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmul z0.h, z0.h, z1.h[3]
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> %a,
+                                                                          <vscale x 8 x bfloat> %b,
+                                                                          i32 3)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfmul_lane_idx7(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfmul_lane_idx7:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmul z0.h, z0.h, z1.h[7]
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat> %a,
+                                                                          <vscale x 8 x bfloat> %b,
+                                                                          i32 7)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.lane.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
new file mode 100644
index 000000000000000..752b5ae9df63076
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfsub_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfsub_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfsub_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfsub_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
+; CHECK-NEXT:    bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @bfsub_u_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfsub_u_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfsub_u(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){
+; CHECK-LABEL: bfsub_u:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfsub z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %elt = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> %elt, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @bfsub_u_zeroing(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: bfsub_u_zeroing:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    bfsub z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> zeroinitializer
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                            <vscale x 8 x bfloat> %a_z,
+                                                            <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfadd.s b/llvm/test/MC/AArch64/SVE2p1/bfadd.s
index 1021df12fc05034..a29f3e6af8ba48d 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfadd.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfadd.s
@@ -16,7 +16,7 @@ bfadd   z23.h, p3/m, z23.h, z13.h  // 01100101-00000000-10001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfadd   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65008db7 <unknown>
 
 movprfx z23, z31
@@ -24,53 +24,53 @@ bfadd   z23.h, p3/m, z23.h, z13.h  // 01100101-00000000-10001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfadd   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65008db7 <unknown>
 
 bfadd   z0.h, p0/m, z0.h, z0.h  // 01100101-00000000-10000000-00000000
 // CHECK-INST: bfadd   z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x80,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65008000 <unknown>
 
 bfadd   z21.h, p5/m, z21.h, z10.h  // 01100101-00000000-10010101-01010101
 // CHECK-INST: bfadd   z21.h, p5/m, z21.h, z10.h
 // CHECK-ENCODING: [0x55,0x95,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65009555 <unknown>
 
 bfadd   z23.h, p3/m, z23.h, z13.h  // 01100101-00000000-10001101-10110111
 // CHECK-INST: bfadd   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65008db7 <unknown>
 
 bfadd   z31.h, p7/m, z31.h, z31.h  // 01100101-00000000-10011111-11111111
 // CHECK-INST: bfadd   z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x9f,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65009fff <unknown>
 
 bfadd   z0.h, z0.h, z0.h  // 01100101-00000000-00000000-00000000
 // CHECK-INST: bfadd   z0.h, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x00,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65000000 <unknown>
 
 bfadd   z21.h, z10.h, z21.h  // 01100101-00010101-00000001-01010101
 // CHECK-INST: bfadd   z21.h, z10.h, z21.h
 // CHECK-ENCODING: [0x55,0x01,0x15,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65150155 <unknown>
 
 bfadd   z23.h, z13.h, z8.h  // 01100101-00001000-00000001-10110111
 // CHECK-INST: bfadd   z23.h, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x01,0x08,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 650801b7 <unknown>
 
 bfadd   z31.h, z31.h, z31.h  // 01100101-00011111-00000011-11111111
 // CHECK-INST: bfadd   z31.h, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x03,0x1f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 651f03ff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfclamp.s b/llvm/test/MC/AArch64/SVE2p1/bfclamp.s
index d7b85edb1730ed4..aed96f3d91e9876 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfclamp.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfclamp.s
@@ -17,30 +17,30 @@ bfclamp z23.h, z13.h, z8.h  // 01100100-00101000-00100101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfclamp z23.h, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x25,0x28,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 642825b7 <unknown>
 
 bfclamp z0.h, z0.h, z0.h  // 01100100-00100000-00100100-00000000
 // CHECK-INST: bfclamp z0.h, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x24,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64202400 <unknown>
 
 bfclamp z21.h, z10.h, z21.h  // 01100100-00110101-00100101-01010101
 // CHECK-INST: bfclamp z21.h, z10.h, z21.h
 // CHECK-ENCODING: [0x55,0x25,0x35,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64352555 <unknown>
 
 bfclamp z23.h, z13.h, z8.h  // 01100100-00101000-00100101-10110111
 // CHECK-INST: bfclamp z23.h, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x25,0x28,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 642825b7 <unknown>
 
 bfclamp z31.h, z31.h, z31.h  // 01100100-00111111-00100111-11111111
 // CHECK-INST: bfclamp z31.h, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x27,0x3f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 643f27ff <unknown>
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmax.s b/llvm/test/MC/AArch64/SVE2p1/bfmax.s
index cd67abc498f3bd2..bf69c0a0406868c 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmax.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmax.s
@@ -17,7 +17,7 @@ bfmax   z23.h, p3/m, z23.h, z13.h  // 01100101-00000110-10001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfmax   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65068db7 <unknown>
 
 movprfx z23, z31
@@ -25,29 +25,29 @@ bfmax   z23.h, p3/m, z23.h, z13.h  // 01100101-00000110-10001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmax   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65068db7 <unknown>
 
 bfmax   z0.h, p0/m, z0.h, z0.h  // 01100101-00000110-10000000-00000000
 // CHECK-INST: bfmax   z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x80,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65068000 <unknown>
 
 bfmax   z21.h, p5/m, z21.h, z10.h  // 01100101-00000110-10010101-01010101
 // CHECK-INST: bfmax   z21.h, p5/m, z21.h, z10.h
 // CHECK-ENCODING: [0x55,0x95,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65069555 <unknown>
 
 bfmax   z23.h, p3/m, z23.h, z13.h  // 01100101-00000110-10001101-10110111
 // CHECK-INST: bfmax   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65068db7 <unknown>
 
 bfmax   z31.h, p7/m, z31.h, z31.h  // 01100101-00000110-10011111-11111111
 // CHECK-INST: bfmax   z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x9f,0x06,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65069fff <unknown>
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s b/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s
index 83669ebc42b1fda..8e4ffc31218ab5c 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmaxnm.s
@@ -17,7 +17,7 @@ bfmaxnm z23.h, p3/m, z23.h, z13.h  // 01100101-00000100-10001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfmaxnm z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65048db7 <unknown>
 
 movprfx z23, z31
@@ -25,30 +25,30 @@ bfmaxnm z23.h, p3/m, z23.h, z13.h  // 01100101-00000100-10001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmaxnm z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65048db7 <unknown>
 
 bfmaxnm z0.h, p0/m, z0.h, z0.h  // 01100101-00000100-10000000-00000000
 // CHECK-INST: bfmaxnm z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x80,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65048000 <unknown>
 
 bfmaxnm z21.h, p5/m, z21.h, z10.h  // 01100101-00000100-10010101-01010101
 // CHECK-INST: bfmaxnm z21.h, p5/m, z21.h, z10.h
 // CHECK-ENCODING: [0x55,0x95,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65049555 <unknown>
 
 bfmaxnm z23.h, p3/m, z23.h, z13.h  // 01100101-00000100-10001101-10110111
 // CHECK-INST: bfmaxnm z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65048db7 <unknown>
 
 bfmaxnm z31.h, p7/m, z31.h, z31.h  // 01100101-00000100-10011111-11111111
 // CHECK-INST: bfmaxnm z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x9f,0x04,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65049fff <unknown>
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmin.s b/llvm/test/MC/AArch64/SVE2p1/bfmin.s
index 1bb3a0e6f1f2639..17bf50913271c56 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmin.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmin.s
@@ -17,7 +17,7 @@ bfmin   z23.h, p3/m, z23.h, z13.h  // 01100101-00000111-10001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfmin   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65078db7 <unknown>
 
 movprfx z23, z31
@@ -25,30 +25,30 @@ bfmin   z23.h, p3/m, z23.h, z13.h  // 01100101-00000111-10001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmin   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65078db7 <unknown>
 
 bfmin   z0.h, p0/m, z0.h, z0.h  // 01100101-00000111-10000000-00000000
 // CHECK-INST: bfmin   z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x80,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65078000 <unknown>
 
 bfmin   z21.h, p5/m, z21.h, z10.h  // 01100101-00000111-10010101-01010101
 // CHECK-INST: bfmin   z21.h, p5/m, z21.h, z10.h
 // CHECK-ENCODING: [0x55,0x95,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65079555 <unknown>
 
 bfmin   z23.h, p3/m, z23.h, z13.h  // 01100101-00000111-10001101-10110111
 // CHECK-INST: bfmin   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65078db7 <unknown>
 
 bfmin   z31.h, p7/m, z31.h, z31.h  // 01100101-00000111-10011111-11111111
 // CHECK-INST: bfmin   z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x9f,0x07,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65079fff <unknown>
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfminnm.s b/llvm/test/MC/AArch64/SVE2p1/bfminnm.s
index 9f444c7ac26ae07..e0cd2adc675eea2 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfminnm.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfminnm.s
@@ -17,7 +17,7 @@ bfminnm z23.h, p3/m, z23.h, z13.h  // 01100101-00000101-10001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfminnm z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65058db7 <unknown>
 
 movprfx z23, z31
@@ -25,30 +25,30 @@ bfminnm z23.h, p3/m, z23.h, z13.h  // 01100101-00000101-10001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfminnm z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65058db7 <unknown>
 
 bfminnm z0.h, p0/m, z0.h, z0.h  // 01100101-00000101-10000000-00000000
 // CHECK-INST: bfminnm z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x80,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65058000 <unknown>
 
 bfminnm z21.h, p5/m, z21.h, z10.h  // 01100101-00000101-10010101-01010101
 // CHECK-INST: bfminnm z21.h, p5/m, z21.h, z10.h
 // CHECK-ENCODING: [0x55,0x95,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65059555 <unknown>
 
 bfminnm z23.h, p3/m, z23.h, z13.h  // 01100101-00000101-10001101-10110111
 // CHECK-INST: bfminnm z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65058db7 <unknown>
 
 bfminnm z31.h, p7/m, z31.h, z31.h  // 01100101-00000101-10011111-11111111
 // CHECK-INST: bfminnm z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x9f,0x05,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65059fff <unknown>
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmla.s b/llvm/test/MC/AArch64/SVE2p1/bfmla.s
index ff257830a13da9b..a265eb8b71df929 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmla.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmla.s
@@ -17,31 +17,31 @@ bfmla   z23.h, z13.h, z0.h[5]  // 01100100-01101000-00001001-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmla   z23.h, z13.h, z0.h[5]
 // CHECK-ENCODING: [0xb7,0x09,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 646809b7 <unknown>
 
 bfmla   z0.h, z0.h, z0.h[0]  // 01100100-00100000-00001000-00000000
 // CHECK-INST: bfmla   z0.h, z0.h, z0.h[0]
 // CHECK-ENCODING: [0x00,0x08,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64200800 <unknown>
 
 bfmla   z21.h, z10.h, z5.h[6]  // 01100100-01110101-00001001-01010101
 // CHECK-INST: bfmla   z21.h, z10.h, z5.h[6]
 // CHECK-ENCODING: [0x55,0x09,0x75,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64750955 <unknown>
 
 bfmla   z23.h, z13.h, z0.h[5]  // 01100100-01101000-00001001-10110111
 // CHECK-INST: bfmla   z23.h, z13.h, z0.h[5]
 // CHECK-ENCODING: [0xb7,0x09,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 646809b7 <unknown>
 
 bfmla   z31.h, z31.h, z7.h[7]  // 01100100-01111111-00001011-11111111
 // CHECK-INST: bfmla   z31.h, z31.h, z7.h[7]
 // CHECK-ENCODING: [0xff,0x0b,0x7f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 647f0bff <unknown>
 
 
@@ -50,7 +50,7 @@ bfmla   z23.h, p3/m, z13.h, z8.h  // 01100101-00101000-00001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfmla   z23.h, p3/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x0d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65280db7 <unknown>
 
 movprfx z23, z31
@@ -58,30 +58,30 @@ bfmla   z23.h, p3/m, z13.h, z8.h  // 01100101-00101000-00001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmla   z23.h, p3/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x0d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65280db7 <unknown>
 
 bfmla   z0.h, p0/m, z0.h, z0.h  // 01100101-00100000-00000000-00000000
 // CHECK-INST: bfmla   z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x00,0x20,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65200000 <unknown>
 
 bfmla   z21.h, p5/m, z10.h, z21.h  // 01100101-00110101-00010101-01010101
 // CHECK-INST: bfmla   z21.h, p5/m, z10.h, z21.h
 // CHECK-ENCODING: [0x55,0x15,0x35,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65351555 <unknown>
 
 bfmla   z23.h, p3/m, z13.h, z8.h  // 01100101-00101000-00001101-10110111
 // CHECK-INST: bfmla   z23.h, p3/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x0d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65280db7 <unknown>
 
 bfmla   z31.h, p7/m, z31.h, z31.h  // 01100101-00111111-00011111-11111111
 // CHECK-INST: bfmla   z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x1f,0x3f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 653f1fff <unknown>
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmls.s b/llvm/test/MC/AArch64/SVE2p1/bfmls.s
index c153b56b9586b19..56713e74adf8f03 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmls.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmls.s
@@ -17,31 +17,31 @@ bfmls   z23.h, z13.h, z0.h[5]  // 01100100-01101000-00001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmls   z23.h, z13.h, z0.h[5]
 // CHECK-ENCODING: [0xb7,0x0d,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64680db7 <unknown>
 
 bfmls   z0.h, z0.h, z0.h[0]  // 01100100-00100000-00001100-00000000
 // CHECK-INST: bfmls   z0.h, z0.h, z0.h[0]
 // CHECK-ENCODING: [0x00,0x0c,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64200c00 <unknown>
 
 bfmls   z21.h, z10.h, z5.h[6]  // 01100100-01110101-00001101-01010101
 // CHECK-INST: bfmls   z21.h, z10.h, z5.h[6]
 // CHECK-ENCODING: [0x55,0x0d,0x75,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64750d55 <unknown>
 
 bfmls   z23.h, z13.h, z0.h[5]  // 01100100-01101000-00001101-10110111
 // CHECK-INST: bfmls   z23.h, z13.h, z0.h[5]
 // CHECK-ENCODING: [0xb7,0x0d,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64680db7 <unknown>
 
 bfmls   z31.h, z31.h, z7.h[7]  // 01100100-01111111-00001111-11111111
 // CHECK-INST: bfmls   z31.h, z31.h, z7.h[7]
 // CHECK-ENCODING: [0xff,0x0f,0x7f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 647f0fff <unknown>
 
 
@@ -50,7 +50,7 @@ bfmls   z23.h, p3/m, z13.h, z8.h  // 01100101-00101000-00101101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfmls   z23.h, p3/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x2d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65282db7 <unknown>
 
 movprfx z23, z31
@@ -58,30 +58,30 @@ bfmls   z23.h, p3/m, z13.h, z8.h  // 01100101-00101000-00101101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmls   z23.h, p3/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x2d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65282db7 <unknown>
 
 bfmls   z0.h, p0/m, z0.h, z0.h  // 01100101-00100000-00100000-00000000
 // CHECK-INST: bfmls   z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x20,0x20,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65202000 <unknown>
 
 bfmls   z21.h, p5/m, z10.h, z21.h  // 01100101-00110101-00110101-01010101
 // CHECK-INST: bfmls   z21.h, p5/m, z10.h, z21.h
 // CHECK-ENCODING: [0x55,0x35,0x35,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65353555 <unknown>
 
 bfmls   z23.h, p3/m, z13.h, z8.h  // 01100101-00101000-00101101-10110111
 // CHECK-INST: bfmls   z23.h, p3/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x2d,0x28,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65282db7 <unknown>
 
 bfmls   z31.h, p7/m, z31.h, z31.h  // 01100101-00111111-00111111-11111111
 // CHECK-INST: bfmls   z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x3f,0x3f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 653f3fff <unknown>
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfmul.s b/llvm/test/MC/AArch64/SVE2p1/bfmul.s
index e0b93bcbb103504..62e7d892468b6f4 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfmul.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfmul.s
@@ -14,25 +14,25 @@
 bfmul   z0.h, z0.h, z0.h[0]  // 01100100-00100000-00101000-00000000
 // CHECK-INST: bfmul   z0.h, z0.h, z0.h[0]
 // CHECK-ENCODING: [0x00,0x28,0x20,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64202800 <unknown>
 
 bfmul   z21.h, z10.h, z5.h[6]  // 01100100-01110101-00101001-01010101
 // CHECK-INST: bfmul   z21.h, z10.h, z5.h[6]
 // CHECK-ENCODING: [0x55,0x29,0x75,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 64752955 <unknown>
 
 bfmul   z23.h, z13.h, z0.h[5]  // 01100100-01101000-00101001-10110111
 // CHECK-INST: bfmul   z23.h, z13.h, z0.h[5]
 // CHECK-ENCODING: [0xb7,0x29,0x68,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 646829b7 <unknown>
 
 bfmul   z31.h, z31.h, z7.h[7]  // 01100100-01111111-00101011-11111111
 // CHECK-INST: bfmul   z31.h, z31.h, z7.h[7]
 // CHECK-ENCODING: [0xff,0x2b,0x7f,0x64]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 647f2bff <unknown>
 
 movprfx  z23.h, p3/m, z31.h
@@ -40,7 +40,7 @@ bfmul   z23.h, p3/m, z23.h, z13.h  // 01100101-00000010-10001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfmul   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65028db7 <unknown>
 
 movprfx z23, z31
@@ -48,54 +48,54 @@ bfmul   z23.h, p3/m, z23.h, z13.h  // 01100101-00000010-10001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfmul   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65028db7 <unknown>
 
 bfmul   z0.h, p0/m, z0.h, z0.h  // 01100101-00000010-10000000-00000000
 // CHECK-INST: bfmul   z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x80,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65028000 <unknown>
 
 bfmul   z21.h, p5/m, z21.h, z10.h  // 01100101-00000010-10010101-01010101
 // CHECK-INST: bfmul   z21.h, p5/m, z21.h, z10.h
 // CHECK-ENCODING: [0x55,0x95,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65029555 <unknown>
 
 bfmul   z23.h, p3/m, z23.h, z13.h  // 01100101-00000010-10001101-10110111
 // CHECK-INST: bfmul   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65028db7 <unknown>
 
 bfmul   z31.h, p7/m, z31.h, z31.h  // 01100101-00000010-10011111-11111111
 // CHECK-INST: bfmul   z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x9f,0x02,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65029fff <unknown>
 
 bfmul   z0.h, z0.h, z0.h  // 01100101-00000000-00001000-00000000
 // CHECK-INST: bfmul   z0.h, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x08,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65000800 <unknown>
 
 bfmul   z21.h, z10.h, z21.h  // 01100101-00010101-00001001-01010101
 // CHECK-INST: bfmul   z21.h, z10.h, z21.h
 // CHECK-ENCODING: [0x55,0x09,0x15,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65150955 <unknown>
 
 bfmul   z23.h, z13.h, z8.h  // 01100101-00001000-00001001-10110111
 // CHECK-INST: bfmul   z23.h, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x09,0x08,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 650809b7 <unknown>
 
 bfmul   z31.h, z31.h, z31.h  // 01100101-00011111-00001011-11111111
 // CHECK-INST: bfmul   z31.h, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x0b,0x1f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 651f0bff <unknown>
 
diff --git a/llvm/test/MC/AArch64/SVE2p1/bfsub.s b/llvm/test/MC/AArch64/SVE2p1/bfsub.s
index 42cb6772c3a517b..66590a72ed6b4c8 100644
--- a/llvm/test/MC/AArch64/SVE2p1/bfsub.s
+++ b/llvm/test/MC/AArch64/SVE2p1/bfsub.s
@@ -16,7 +16,7 @@ bfsub   z23.h, p3/m, z23.h, z13.h  // 01100101-00000001-10001101-10110111
 // CHECK-INST:  movprfx  z23.h, p3/m, z31.h
 // CHECK-INST: bfsub   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65018db7 <unknown>
 
 movprfx z23, z31
@@ -24,53 +24,53 @@ bfsub   z23.h, p3/m, z23.h, z13.h  // 01100101-00000001-10001101-10110111
 // CHECK-INST:  movprfx z23, z31
 // CHECK-INST: bfsub   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65018db7 <unknown>
 
 bfsub   z0.h, p0/m, z0.h, z0.h  // 01100101-00000001-10000000-00000000
 // CHECK-INST: bfsub   z0.h, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x80,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65018000 <unknown>
 
 bfsub   z21.h, p5/m, z21.h, z10.h  // 01100101-00000001-10010101-01010101
 // CHECK-INST: bfsub   z21.h, p5/m, z21.h, z10.h
 // CHECK-ENCODING: [0x55,0x95,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65019555 <unknown>
 
 bfsub   z23.h, p3/m, z23.h, z13.h  // 01100101-00000001-10001101-10110111
 // CHECK-INST: bfsub   z23.h, p3/m, z23.h, z13.h
 // CHECK-ENCODING: [0xb7,0x8d,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65018db7 <unknown>
 
 bfsub   z31.h, p7/m, z31.h, z31.h  // 01100101-00000001-10011111-11111111
 // CHECK-INST: bfsub   z31.h, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x9f,0x01,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65019fff <unknown>
 
 bfsub   z0.h, z0.h, z0.h  // 01100101-00000000-00000100-00000000
 // CHECK-INST: bfsub   z0.h, z0.h, z0.h
 // CHECK-ENCODING: [0x00,0x04,0x00,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65000400 <unknown>
 
 bfsub   z21.h, z10.h, z21.h  // 01100101-00010101-00000101-01010101
 // CHECK-INST: bfsub   z21.h, z10.h, z21.h
 // CHECK-ENCODING: [0x55,0x05,0x15,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 65150555 <unknown>
 
 bfsub   z23.h, z13.h, z8.h  // 01100101-00001000-00000101-10110111
 // CHECK-INST: bfsub   z23.h, z13.h, z8.h
 // CHECK-ENCODING: [0xb7,0x05,0x08,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 650805b7 <unknown>
 
 bfsub   z31.h, z31.h, z31.h  // 01100101-00011111-00000111-11111111
 // CHECK-INST: bfsub   z31.h, z31.h, z31.h
 // CHECK-ENCODING: [0xff,0x07,0x1f,0x65]
-// CHECK-ERROR: instruction requires: b16b16 sme2p1 or sve2p1
+// CHECK-ERROR: instruction requires: b16b16 sve2p1
 // CHECK-UNKNOWN: 651f07ff <unknown>