[llvm] 67e4330 - [sve][acle] Implement some of the C intrinsics for brain float.

Francesco Petrogalli via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 29 09:10:23 PDT 2020


Author: Francesco Petrogalli
Date: 2020-06-29T16:09:08Z
New Revision: 67e4330facfbf798ecc40cd2449f70e6758078b9

URL: https://github.com/llvm/llvm-project/commit/67e4330facfbf798ecc40cd2449f70e6758078b9
DIFF: https://github.com/llvm/llvm-project/commit/67e4330facfbf798ecc40cd2449f70e6758078b9.diff

LOG: [sve][acle] Implement some of the C intrinsics for brain float.

Summary:
The following intrinsics have been extended to support brain float (bfloat16) types:

svbfloat16_t svclasta[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data)
bfloat16_t svclasta[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data)
bfloat16_t svlasta[_bf16](svbool_t pg, svbfloat16_t op)

svbfloat16_t svclastb[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data)
bfloat16_t svclastb[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data)
bfloat16_t svlastb[_bf16](svbool_t pg, svbfloat16_t op)

svbfloat16_t svdup[_n]_bf16(bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_x(svbool_t pg, bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_z(svbool_t pg, bfloat16_t op)

svbfloat16_t svdupq[_n]_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3, bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7)
svbfloat16_t svdupq_lane[_bf16](svbfloat16_t data, uint64_t index)

svbfloat16_t svinsr[_n_bf16](svbfloat16_t op1, bfloat16_t op2)
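
As a quick illustration (not part of the original summary), the sketch
below exercises two of the names above. It assumes a compiler that
defines __ARM_FEATURE_SVE_BF16 and provides <arm_sve.h>; the function
name is made up for the example:

    #include <arm_sve.h>

    // Broadcast a scalar bfloat16 across a scalable vector, then read
    // back the last element selected by the predicate.
    bfloat16_t splat_and_extract(svbool_t pg, bfloat16_t x) {
      svbfloat16_t v = svdup_n_bf16(x); // svdup[_n]_bf16
      return svlastb_bf16(pg, v);       // svlastb[_bf16]
    }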

Reviewers: sdesmalen, kmclaughlin, c-rhodes, ctetreau, efriedma

Subscribers: tschuett, hiraditya, rkruppe, psnobl, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D82345

Added: 
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c
    clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c

Modified: 
    clang/include/clang/Basic/arm_sve.td
    clang/lib/CodeGen/CGBuiltin.cpp
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
    llvm/test/CodeGen/AArch64/sve-vector-splat.ll

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 5a0989640eae..1d1b622349ed 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -725,13 +725,23 @@ def SVADRD : SInst<"svadrd[_{0}base]_[{2}]index",  "uud", "ilUiUl", MergeNone, "
 
 def SVDUPQ_8  : SInst<"svdupq[_n]_{d}", "dssssssssssssssss",  "cUc", MergeNone>;
 def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss",  "sUsh", MergeNone>;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss",  "b", MergeNone>;
+}
 def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss",  "iUif", MergeNone>;
 def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss",  "lUld", MergeNone>;
 
-def SVDUP   : SInst<"svdup[_n]_{d}", "ds",   "csilUcUsUiUlhfd", MergeNone,    "aarch64_sve_dup_x">;
-def SVDUP_M : SInst<"svdup[_n]_{d}", "ddPs", "csilUcUsUiUlhfd", MergeOp1,     "aarch64_sve_dup">;
-def SVDUP_X : SInst<"svdup[_n]_{d}", "dPs",  "csilUcUsUiUlhfd", MergeAnyExp,  "aarch64_sve_dup">;
-def SVDUP_Z : SInst<"svdup[_n]_{d}", "dPs",  "csilUcUsUiUlhfd", MergeZeroExp, "aarch64_sve_dup">;
+multiclass svdup_base<string n, string p, MergeType mt, string i> {
+  def NAME : SInst<n, p, "csilUcUsUiUlhfd", mt, i>;
+  let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+    def _BF16: SInst<n, p, "b", mt, i>;
+  }
+}
+
+defm SVDUP   : svdup_base<"svdup[_n]_{d}", "ds",   MergeNone,    "aarch64_sve_dup_x">;
+defm SVDUP_M : svdup_base<"svdup[_n]_{d}", "ddPs", MergeOp1,     "aarch64_sve_dup">;
+defm SVDUP_X : svdup_base<"svdup[_n]_{d}", "dPs",  MergeAnyExp,  "aarch64_sve_dup">;
+defm SVDUP_Z : svdup_base<"svdup[_n]_{d}", "dPs",  MergeZeroExp, "aarch64_sve_dup">;
 
 def SVINDEX : SInst<"svindex_{d}",   "dss",  "csilUcUsUiUl",    MergeNone,    "aarch64_sve_index">;
 
@@ -850,8 +860,11 @@ defm SVLSR : SInst_SHIFT<"svlsr", "aarch64_sve_lsr", "UcUsUiUl", "UcUsUi">;
 def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil",            MergeOp1,  "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
 def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil",            MergeAny,  "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
 def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil",            MergeZero, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
-def SVINSR   : SInst<"svinsr[_n_{d}]", "dds",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">;
 
+def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds",  "b", MergeNone, "aarch64_sve_insr">;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Integer reductions
@@ -1173,10 +1186,18 @@ def SVCVTXNT_F32    : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch6
 ////////////////////////////////////////////////////////////////////////////////
 // Permutations and selection
 
-def SVCLASTA     : SInst<"svclasta[_{d}]",    "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_clasta">;
-def SVCLASTA_N   : SInst<"svclasta[_n_{d}]",  "sPsd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_clasta_n">;
-def SVCLASTB     : SInst<"svclastb[_{d}]",    "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_clastb">;
-def SVCLASTB_N   : SInst<"svclastb[_n_{d}]",  "sPsd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_clastb_n">;
+multiclass SVEPerm<string name, string proto, string i> {
+  def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i>;
+  let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+    def: SInst<name, proto, "b", MergeNone, i>;
+  }
+}
+
+defm SVCLASTA    : SVEPerm<"svclasta[_{d}]",   "dPdd", "aarch64_sve_clasta">;
+defm SVCLASTA_N  : SVEPerm<"svclasta[_n_{d}]", "sPsd", "aarch64_sve_clasta_n">;
+defm SVCLASTB    : SVEPerm<"svclastb[_{d}]",   "dPdd", "aarch64_sve_clastb">;
+defm SVCLASTB_N  : SVEPerm<"svclastb[_n_{d}]", "sPsd", "aarch64_sve_clastb_n">;
+
 def SVCOMPACT    : SInst<"svcompact[_{d}]",   "dPd",  "ilUiUlfd",        MergeNone, "aarch64_sve_compact">;
 // Note: svdup_lane is implemented using the intrinsic for TBL to represent a
 // splat of any possible lane. It is up to LLVM to pick a more efficient
@@ -1184,9 +1205,12 @@ def SVCOMPACT    : SInst<"svcompact[_{d}]",   "dPd",  "ilUiUlfd",        MergeNo
 // instruction's immediate.
 def SVDUP_LANE   : SInst<"svdup_lane[_{d}]",  "ddL",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">;
 def SVDUPQ_LANE  : SInst<"svdupq_lane[_{d}]", "ddn",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVDUPQ_LANE_BF16  : SInst<"svdupq_lane[_{d}]", "ddn",  "b", MergeNone, "aarch64_sve_dupq_lane">;
+}
 def SVEXT        : SInst<"svext[_{d}]",       "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>;
-def SVLASTA      : SInst<"svlasta[_{d}]",     "sPd",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_lasta">;
-def SVLASTB      : SInst<"svlastb[_{d}]",     "sPd",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_lastb">;
+defm SVLASTA     : SVEPerm<"svlasta[_{d}]",   "sPd",  "aarch64_sve_lasta">;
+defm SVLASTB     : SVEPerm<"svlastb[_{d}]",   "sPd",  "aarch64_sve_lastb">;
 def SVREV        : SInst<"svrev[_{d}]",       "dd",   "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev">;
 def SVSEL        : SInst<"svsel[_{d}]",       "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel">;
 def SVSPLICE     : SInst<"svsplice[_{d}]",    "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice">;
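
Note on the ArchGuard fields above: the header generator emits the
guarded definitions inside the corresponding preprocessor conditional,
so the bfloat16 forms are only visible to user code when the feature
macro is defined. A minimal sketch of the intended usage pattern (the
guard shape is inferred from the macros used in the tests below):

    #if defined(__ARM_FEATURE_SVE_BF16)
    svbfloat16_t dup_bf16_guarded(bfloat16_t x) {
      return svdup_n_bf16(x);
    }
    #endif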

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b5c4841578c4..0a3adf6fbf24 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8386,6 +8386,7 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
   case SVE::BI__builtin_sve_svdupq_n_s64:
   case SVE::BI__builtin_sve_svdupq_n_u16:
   case SVE::BI__builtin_sve_svdupq_n_f16:
+  case SVE::BI__builtin_sve_svdupq_n_bf16:
   case SVE::BI__builtin_sve_svdupq_n_s16:
   case SVE::BI__builtin_sve_svdupq_n_u32:
   case SVE::BI__builtin_sve_svdupq_n_f32:
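
The added case routes svdupq_n_bf16 through the same lowering as the
other svdupq_n builtins: the eight scalars are stored to a 16-byte
stack slot and broadcast with ld1rq, as the dupq test below verifies.
A hedged usage sketch (function name is illustrative):

    // Replicate a 128-bit pattern of eight bfloat16 values across the
    // whole scalable vector.
    svbfloat16_t make_pattern(bfloat16_t a, bfloat16_t b) {
      return svdupq_n_bf16(a, b, a, b, a, b, a, b);
    }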

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c
new file mode 100644
index 000000000000..cf8e829a8819
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta-bfloat.c
@@ -0,0 +1,36 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svclasta_bf16(svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) {
+  // CHECK-LABEL: test_svclasta_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1> %[[PG]], <vscale x 8 x bfloat> %fallback, <vscale x 8 x bfloat> %data)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svclasta_bf16'}}
+  return SVE_ACLE_FUNC(svclasta, _bf16, , )(pg, fallback, data);
+}
+
+bfloat16_t test_svclasta_n_bf16(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) {
+  // CHECK-LABEL: test_svclasta_n_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat %fallback, <vscale x 8 x bfloat> %data)
+  // CHECK: ret bfloat %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svclasta_n_bf16'}}
+  return SVE_ACLE_FUNC(svclasta, _n_bf16, , )(pg, fallback, data);
+}
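
For reference, the SVE_ACLE_FUNC macro in these tests just pastes its
arguments so each file can exercise both naming schemes:

    // Default build: all four tokens are concatenated.
    //   SVE_ACLE_FUNC(svclasta, _bf16, , )  ->  svclasta_bf16
    // With -DSVE_OVERLOADED_FORMS: only the 1st and 3rd are kept.
    //   SVE_ACLE_FUNC(svclasta, _bf16, , )  ->  svclasta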

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c
new file mode 100644
index 000000000000..3dd84dc705fc
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb-bfloat.c
@@ -0,0 +1,36 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svclastb_bf16(svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) {
+  // CHECK-LABEL: test_svclastb_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1> %[[PG]], <vscale x 8 x bfloat> %fallback, <vscale x 8 x bfloat> %data)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svclastb_bf16'}}
+  return SVE_ACLE_FUNC(svclastb, _bf16, , )(pg, fallback, data);
+}
+
+bfloat16_t test_svclastb_n_bf16(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) {
+  // CHECK-LABEL: test_svclastb_n_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat %fallback, <vscale x 8 x bfloat> %data)
+  // CHECK: ret bfloat %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svclastb_n_bf16'}}
+  return SVE_ACLE_FUNC(svclastb, _n_bf16, , )(pg, fallback, data);
+}

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c
new file mode 100644
index 000000000000..4e0508f39412
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup-bfloat.c
@@ -0,0 +1,53 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svdup_n_bf16(bfloat16_t op) {
+  // CHECK-LABEL: test_svdup_n_bf16
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16'}}
+  return SVE_ACLE_FUNC(svdup, _n, _bf16, )(op);
+}
+
+svbfloat16_t test_svdup_n_bf16_z(svbool_t pg, bfloat16_t op) {
+  // CHECK-LABEL: test_svdup_n_bf16_z
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %[[PG]], bfloat %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_z'}}
+  return SVE_ACLE_FUNC(svdup, _n, _bf16_z, )(pg, op);
+}
+
+svbfloat16_t test_svdup_n_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op) {
+  // CHECK-LABEL: test_svdup_n_bf16_m
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %[[PG]], bfloat %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_m'}}
+  return SVE_ACLE_FUNC(svdup, _n, _bf16_m, )(inactive, pg, op);
+}
+
+svbfloat16_t test_svdup_n_bf16_x(svbool_t pg, bfloat16_t op) {
+  // CHECK-LABEL: test_svdup_n_bf16_x
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %[[PG]], bfloat %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_x'}}
+  return SVE_ACLE_FUNC(svdup, _n, _bf16_x, )(pg, op);
+}

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
new file mode 100644
index 000000000000..2b3603242eed
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
@@ -0,0 +1,42 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svdupq_lane_bf16(svbfloat16_t data, uint64_t index) {
+  // CHECK-LABEL: test_svdupq_lane_bf16
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %data, i64 %index)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svdupq_lane_bf16'}}
+  return SVE_ACLE_FUNC(svdupq_lane, _bf16, , )(data, index);
+}
+svbfloat16_t test_svdupq_n_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3,
+                                bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7) {
+  // CHECK-LABEL: test_svdupq_n_bf16
+  // CHECK: %[[ALLOCA:.*]] = alloca [8 x bfloat], align 16
+  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 0
+  // CHECK-DAG: store bfloat %x0, bfloat* %[[BASE]], align 16
+  // <assume other stores>
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 7
+  // CHECK: store bfloat %x7, bfloat* %[[GEP]], align 2
+  // CHECK-NOT: store
+  // CHECK: call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %{{.*}}, bfloat* nonnull %[[BASE]])
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svdupq_n_bf16'}}
+  return SVE_ACLE_FUNC(svdupq, _n, _bf16, )(x0, x1, x2, x3, x4, x5, x6, x7);
+}

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c
new file mode 100644
index 000000000000..30d65995fbda
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr-bfloat.c
@@ -0,0 +1,26 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svinsr_n_bf16(svbfloat16_t op1, bfloat16_t op2) {
+  // CHECK-LABEL: test_svinsr_n_bf16
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat> %op1, bfloat %op2)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svinsr_n_bf16'}}
+  return SVE_ACLE_FUNC(svinsr, _n_bf16, , )(op1, op2);
+}

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c
new file mode 100644
index 000000000000..cb02d2dc60ad
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lasta-bfloat.c
@@ -0,0 +1,27 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+bfloat16_t test_svlasta_bf16(svbool_t pg, svbfloat16_t op) {
+  // CHECK-LABEL: test_svlasta_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1> %[[PG]], <vscale x 8 x bfloat> %op)
+  // CHECK: ret bfloat %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svlasta_bf16'}}
+  return SVE_ACLE_FUNC(svlasta, _bf16, , )(pg, op);
+}

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c
new file mode 100644
index 000000000000..5c8abb55a077
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_lastb-bfloat.c
@@ -0,0 +1,27 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
+// RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+// If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
+// ASM-NOT: warning
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+bfloat16_t test_svlastb_bf16(svbool_t pg, svbfloat16_t op) {
+  // CHECK-LABEL: test_svlastb_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1> %[[PG]], <vscale x 8 x bfloat> %op)
+  // CHECK: ret bfloat %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svlastb_bf16'}}
+  return SVE_ACLE_FUNC(svlastb, _bf16, , )(pg, op);
+}

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d3a1c2789cfe..0344aad85030 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -423,6 +423,11 @@ let Predicates = [HasSVE] in {
   defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
   defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
+              (CPY_ZPmV_H $passthru, $pg, $splat)>;
+  }
+
   // Duplicate FP scalar into all vector elements
   def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
             (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
@@ -436,6 +441,10 @@ let Predicates = [HasSVE] in {
             (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
   def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
             (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+              (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+  }
 
   // Duplicate +0.0 into all vector elements
   def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
@@ -444,6 +453,9 @@ let Predicates = [HasSVE] in {
   def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
   def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
   def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+  }
 
   // Duplicate Int immediate into all vector elements
   def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
@@ -486,6 +498,10 @@ let Predicates = [HasSVE] in {
   defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
   defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
+  }
+
   defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
   defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
   defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
@@ -554,11 +570,23 @@ let Predicates = [HasSVE] in {
   defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
   defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_3_Op_Pat<bf16,     AArch64clasta_n,        nxv8i1, bf16,     nxv8bf16, CLASTA_VPZ_H>;
+    def : SVE_3_Op_Pat<bf16,     AArch64clastb_n,        nxv8i1, bf16,     nxv8bf16, CLASTB_VPZ_H>;
+    def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
+    def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
+  }
+
   defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
   defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
   defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
   defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
 
+  let Predicates = [HasSVE, HasBF16] in {
+    def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
+    def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
+  }
+
   // continuous load with reg+immediate
   defm LD1B_IMM    : sve_mem_cld_si<0b0000, "ld1b",  Z_b, ZPR8>;
   defm LD1B_H_IMM  : sve_mem_cld_si<0b0001, "ld1b",  Z_h, ZPR16>;
@@ -1499,6 +1527,13 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
     def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
     def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
     def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
+  }
+
+  let Predicates = [IsLE, HasBF16, HasSVE] in {
+    def : Pat<(nxv2i64  (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))),  (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))),  (nxv8bf16 ZPR:$src)>;
   }
 
   let Predicates = [IsLE, HasSVE, HasBF16] in {

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
index d6f240a15bed..788592c131bc 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@@ -81,6 +81,14 @@ define <vscale x 8 x half> @dup_f16(half %b) {
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dup_bf16(bfloat %b) #0 {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 8 x half> @dup_imm_f16(half %b) {
 ; CHECK-LABEL: dup_imm_f16:
 ; CHECK: mov z0.h, #16.00000000
@@ -126,5 +134,9 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index e5866c1b68ff..433a1cdfd8e9 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -57,6 +57,16 @@ define <vscale x 8 x half> @clasta_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @clasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clasta_bf16:
+; CHECK: clasta z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_f32:
 ; CHECK: clasta z0.s, p0, z0.s, z1.s
@@ -131,6 +141,16 @@ define half @clasta_n_f16(<vscale x 8 x i1> %pg, half %a, <vscale x 8 x half> %b
   ret half %out
 }
 
+define bfloat @clasta_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clasta_n_bf16:
+; CHECK: clasta h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                         <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clasta_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_n_f32:
 ; CHECK: clasta s0, p0, s0, z1.s
@@ -205,6 +225,16 @@ define <vscale x 8 x half> @clastb_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @clastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clastb_bf16:
+; CHECK: clastb z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_f32:
 ; CHECK: clastb z0.s, p0, z0.s, z1.s
@@ -279,6 +309,16 @@ define half @clastb_n_f16(<vscale x 8 x i1> %pg, half %a, <vscale x 8 x half> %b
   ret half %out
 }
 
+define bfloat @clastb_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: clastb_n_bf16:
+; CHECK: clastb h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                         <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clastb_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_n_f32:
 ; CHECK: clastb s0, p0, s0, z1.s
@@ -343,6 +383,14 @@ define <vscale x 8 x half> @dupq_f16(<vscale x 8 x half> %a) {
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dupq_bf16(<vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: dupq_bf16:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 0)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dupq_f32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: dupq_f32:
 ; CHECK: mov z0.q, z0.q[1]
@@ -432,6 +480,20 @@ define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
   ret <vscale x 8 x half> %out
 }
 
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x bfloat> @dupq_lane_bf16(<vscale x 8 x bfloat> %a, i64 %idx) #0 {
+; CHECK-LABEL: dupq_lane_bf16:
+; CHECK-DAG:  index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG:  and   [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG:  add   [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG:  mov   [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK:      add   [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 %idx)
+  ret <vscale x 8 x bfloat> %out
+}
+
 ; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
 define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_f32:
@@ -605,6 +667,15 @@ define half @lasta_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
   ret half %res
 }
 
+define bfloat @lasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: lasta_bf16
+; CHECK: lasta h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lasta_f32
 ; CHECK: lasta s0, p0, z0.s
@@ -681,6 +752,15 @@ define half @lastb_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
   ret half %res
 }
 
+define bfloat @lastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) #0 {
+; CHECK-LABEL: lastb_bf16
+; CHECK: lastb h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lastb_f32
 ; CHECK: lastb s0, p0, z0.s
@@ -1851,6 +1931,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.clasta.nxv8i16(<vscale x 8 x i1>, <
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 
@@ -1859,6 +1940,7 @@ declare i16 @llvm.aarch64.sve.clasta.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x
 declare i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)
 
@@ -1867,6 +1949,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.clastb.nxv8i16(<vscale x 8 x i1>, <
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 
@@ -1875,6 +1958,7 @@ declare i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x
 declare i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)
 
@@ -1888,6 +1972,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
 
@@ -1905,6 +1990,7 @@ declare i16 @llvm.aarch64.sve.lasta.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16
 declare i32 @llvm.aarch64.sve.lasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lasta.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
@@ -1914,6 +2000,7 @@ declare i16 @llvm.aarch64.sve.lastb.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16
 declare i32 @llvm.aarch64.sve.lastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lastb.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
index a9ff1648f3b1..9f679aa6dc4f 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
@@ -57,6 +57,16 @@ define <vscale x 8 x half> @dup_f16(<vscale x 8 x half> %a, <vscale x 8 x i1> %p
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dup_bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, bfloat %b) #0 {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %a,
+                                                                   <vscale x 8 x i1> %pg,
+                                                                   bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dup_f32(<vscale x 4 x float> %a, <vscale x 4 x i1> %pg, float %b) {
 ; CHECK-LABEL: dup_f32:
 ; CHECK: mov z0.s, p0/m, s1
@@ -77,10 +87,41 @@ define <vscale x 2 x double> @dup_f64(<vscale x 2 x double> %a, <vscale x 2 x i1
   ret <vscale x 2 x double> %out
 }
 
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_z(<vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_z:
+; CHECK: mov z1.h, #0
+; CHECK: mov z1.h, p0/m, h0
+; CHECK: mov z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_m(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_m:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_x(<vscale x 8 x i1> %pg, bfloat %op) #0 {
+; CHECK-LABEL: test_svdup_n_bf16_x:
+; CHECK: mov z0.h, p0/m, h0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %pg, bfloat %op)
+  ret <vscale x 8 x bfloat> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
index 0333eb79714b..dd884d5577f0 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
@@ -165,6 +165,14 @@ define <vscale x 8 x half> @insr_f16(<vscale x 8 x half> %a, half %b) {
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @insr_bf16(<vscale x 8 x bfloat> %a, bfloat %b) #0 {
+; CHECK-LABEL: insr_bf16:
+; CHECK: insr z0.h, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat> %a, bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @insr_f32(<vscale x 4 x float> %a, float %b) {
 ; CHECK-LABEL: insr_f32:
 ; CHECK: insr z0.s, s1
@@ -348,6 +356,7 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.insr.nxv8i16(<vscale x 8 x i16>, i1
 declare <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.insr.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.insr.nxv8f16(<vscale x 8 x half>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.insr.nxv4f32(<vscale x 4 x float>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.insr.nxv2f64(<vscale x 2 x double>, double)
 
@@ -368,3 +377,6 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vsc
 declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }

diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
index 5bacbee042c0..af43f8fc97e9 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -172,6 +172,15 @@ define <vscale x 16 x i1> @sve_splat_16xi1(i1 %val) {
 
 ;; Splats of legal floating point vector types
 
+define <vscale x 8 x bfloat> @splat_nxv8bf16(bfloat %val) #0 {
+; CHECK-LABEL: splat_nxv8bf16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 8 x bfloat> undef, bfloat %val, i32 0
+  %2 = shufflevector <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %2
+}
+
 define <vscale x 8 x half> @splat_nxv8f16(half %val) {
 ; CHECK-LABEL: splat_nxv8f16:
 ; CHECK: mov z0.h, h0
@@ -233,6 +242,13 @@ define <vscale x 8 x half> @splat_nxv8f16_zero() {
   ret <vscale x 8 x half> zeroinitializer
 }
 
+define <vscale x 8 x bfloat> @splat_nxv8bf16_zero() #0 {
+; CHECK-LABEL: splat_nxv8bf16_zero:
+; CHECK: mov z0.h, #0
+; CHECK-NEXT: ret
+  ret <vscale x 8 x bfloat> zeroinitializer
+}
+
 define <vscale x 4 x half> @splat_nxv4f16_zero() {
 ; CHECK-LABEL: splat_nxv4f16_zero:
 ; CHECK: mov z0.h, #0
@@ -321,3 +337,6 @@ define <vscale x 2 x double> @splat_nxv2f64_imm() {
   %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
   ret <vscale x 2 x double> %2
 }
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+bf16" }


        

