[llvm] 4e4bba7 - [AArch64][llvm] Add intrinsics for SVE BFSCALE (#172025)

Wed Dec 17 07:06:02 PST 2025

Author: Jonathan Thackray
Date: 2025-12-17T15:05:57Z
New Revision: 4e4bba73320c542203ef102d7b392a6c01859080

URL: https://github.com/llvm/llvm-project/commit/4e4bba73320c542203ef102d7b392a6c01859080
DIFF: https://github.com/llvm/llvm-project/commit/4e4bba73320c542203ef102d7b392a6c01859080.diff

LOG: [AArch64][llvm] Add intrinsics for SVE BFSCALE (#172025)

Add AArch64 intrinsics for the SVE BFSCALE instruction (BFloat16
floating-point adjust exponent, vectors):

```c
  svbfloat16_t svscale[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
  svbfloat16_t svscale[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
  svbfloat16_t svscale[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
  svbfloat16_t svscale[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, int16_t zm);
  svbfloat16_t svscale[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, int16_t zm);
  svbfloat16_t svscale[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, int16_t zm);
```

Added: 
    clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfscale.c
    clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve-bfscale___sme_AND_sve-bfscale_AND_sme2.c
    llvm/test/CodeGen/AArch64/sve-intrinsics-bfscale.ll

Modified: 
    clang/include/clang/Basic/arm_sve.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td
    llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 3597418c5ac04..b68dab4757581 100644

--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -796,6 +796,16 @@ def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1,  "aarch64_sv
 def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny,  "aarch64_sve_fscale", [VerifyRuntimeMode]>;
 def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale", [VerifyRuntimeMode]>;
 
+let SVETargetGuard = "sve-bfscale", SMETargetGuard = "sve-bfscale,sme2" in {
+  def SVBFSCALE_M : SInst<"svscale[_{d}]",   "dPdx", "b", MergeOp1,  "aarch64_sve_fscale", [VerifyRuntimeMode]>;
+  def SVBFSCALE_X : SInst<"svscale[_{d}]",   "dPdx", "b", MergeAny,  "aarch64_sve_fscale", [VerifyRuntimeMode]>;
+  def SVBFSCALE_Z : SInst<"svscale[_{d}]",   "dPdx", "b", MergeZero, "aarch64_sve_fscale", [VerifyRuntimeMode]>;
+
+  def SVBFSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "b", MergeOp1,  "aarch64_sve_fscale", [VerifyRuntimeMode]>;
+  def SVBFSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "b", MergeAny,  "aarch64_sve_fscale", [VerifyRuntimeMode]>;
+  def SVBFSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "b", MergeZero, "aarch64_sve_fscale", [VerifyRuntimeMode]>;
+}
+
 defm SVMAD_F  : SInstZPZZZ<"svmad",  "hfd", "aarch64_sve_fmad",  "aarch64_sve_fmla_u",  [VerifyRuntimeMode, ReverseMergeAnyAccOp]>;
 defm SVMLA_F  : SInstZPZZZ<"svmla",  "hfd", "aarch64_sve_fmla",  "aarch64_sve_fmla_u", [VerifyRuntimeMode]>;
 defm SVMLS_F  : SInstZPZZZ<"svmls",  "hfd", "aarch64_sve_fmls",  "aarch64_sve_fmls_u", [VerifyRuntimeMode]>;

diff  --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfscale.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfscale.c
new file mode 100644
index 0000000000000..b992060a78587
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_bfscale.c
@@ -0,0 +1,142 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-bfscale -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-bfscale -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-bfscale -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sve.h>
+
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svscale_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_bf16_zu10__SVBool_tu14__SVBfloat16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svscale_bf16_z(svbool_t pg, svbfloat16_t op1, svint16_t op2) MODE_ATTR
+{
+  return SVE_ACLE_FUNC(svscale,_bf16,_z,)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_bf16_mu10__SVBool_tu14__SVBfloat16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svscale_bf16_m(svbool_t pg, svbfloat16_t op1, svint16_t op2) MODE_ATTR
+{
+  return SVE_ACLE_FUNC(svscale,_bf16,_m,)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z19test_svscale_bf16_xu10__SVBool_tu14__SVBfloat16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[OP2:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svscale_bf16_x(svbool_t pg, svbfloat16_t op1, svint16_t op2) MODE_ATTR
+{
+  return SVE_ACLE_FUNC(svscale,_bf16,_x,)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_n_bf16_z(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svscale_n_bf16_zu10__SVBool_tu14__SVBfloat16_ts(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = select <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svscale_n_bf16_z(svbool_t pg, svbfloat16_t op1, int16_t op2) MODE_ATTR
+{
+  return SVE_ACLE_FUNC(svscale,_n_bf16,_z,)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_n_bf16_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svscale_n_bf16_mu10__SVBool_tu14__SVBfloat16_ts(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svscale_n_bf16_m(svbool_t pg, svbfloat16_t op1, int16_t op2) MODE_ATTR
+{
+  return SVE_ACLE_FUNC(svscale,_n_bf16,_m,)(pg, op1, op2);
+}
+
+// CHECK-LABEL: @test_svscale_n_bf16_x(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[OP2:%.*]], i64 0
+// CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svscale_n_bf16_xu10__SVBool_tu14__SVBfloat16_ts(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[OP2:%.*]], i64 0
+// CPP-CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x i16> [[DOTSPLAT]])
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
+//
+svbfloat16_t test_svscale_n_bf16_x(svbool_t pg, svbfloat16_t op1, int16_t op2) MODE_ATTR
+{
+  return SVE_ACLE_FUNC(svscale,_n_bf16,_x,)(pg, op1, op2);
+}

diff  --git a/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve-bfscale___sme_AND_sve-bfscale_AND_sme2.c b/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve-bfscale___sme_AND_sve-bfscale_AND_sme2.c
new file mode 100644
index 0000000000000..4c64f566cc786
--- /dev/null
+++ b/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve-bfscale___sme_AND_sve-bfscale_AND_sme2.c
@@ -0,0 +1,94 @@
+// NOTE: File has been autogenerated by utils/aarch64_builtins_test_generator.py
+// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +sve-bfscale -verify=guard
+// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve -target-feature +sve-bfscale -verify
+// expected-no-diagnostics
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+// Properties: guard="sve,sve-bfscale" streaming_guard="sme,sve-bfscale,sme2" flags="feature-dependent"
+
+void test(void) {
+  int16_t int16_t_val;
+  svbfloat16_t svbfloat16_t_val;
+  svbool_t svbool_t_val;
+  svint16_t svint16_t_val;
+
+  svscale_bf16_m(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  svscale_bf16_x(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  svscale_bf16_z(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  svscale_m(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  svscale_m(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  svscale_n_bf16_m(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  svscale_n_bf16_x(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  svscale_n_bf16_z(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  svscale_x(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  svscale_x(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  svscale_z(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  svscale_z(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+}
+
+void test_streaming(void) __arm_streaming{
+  int16_t int16_t_val;
+  svbfloat16_t svbfloat16_t_val;
+  svbool_t svbool_t_val;
+  svint16_t svint16_t_val;
+
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_bf16_m(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_bf16_x(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_bf16_z(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_m(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_m(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_n_bf16_m(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_n_bf16_x(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_n_bf16_z(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_x(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_x(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_z(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_z(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+}
+
+void test_streaming_compatible(void) __arm_streaming_compatible{
+  int16_t int16_t_val;
+  svbfloat16_t svbfloat16_t_val;
+  svbool_t svbool_t_val;
+  svint16_t svint16_t_val;
+
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_bf16_m(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_bf16_x(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_bf16_z(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_m(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_m(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_n_bf16_m(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_n_bf16_x(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_n_bf16_z(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_x(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_x(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_z(svbool_t_val, svbfloat16_t_val, int16_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svscale_z(svbool_t_val, svbfloat16_t_val, svint16_t_val);
+}

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c22929f379dfc..b90f3552ed398 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -211,7 +211,7 @@ def HasSME2p2        : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2
                                  AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">;
 def HasSVEAES2       : Predicate<"Subtarget->hasSVEAES2()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">;
-def HasSVEBFSCALE    : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">,
+def HasSVEBFSCALE    : Predicate<"Subtarget->isNonStreamingSVEorSME2Available() && Subtarget->hasSVE_BFSCALE()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">;
 def HasSVE_F16F32MM  : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">;

diff  --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index ffb24dfbcd527..4c274de776eba 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4537,7 +4537,7 @@ defm BFMAX_ZPZZ   : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fmax>;
 } // HasSVEB16B16, HasNonStreamingSVE_or_SME2, UseExperimentalZeroingPseudos
 
 let Predicates = [HasSVEBFSCALE] in {
-  def BFSCALE_ZPZZ : sve_fp_2op_p_zds_bfscale<0b1001, "bfscale", DestructiveBinary>;
+  defm BFSCALE_ZPZZ : sve_fp_2op_p_zds_bfscale<0b1001, "bfscale", int_aarch64_sve_fscale, DestructiveBinary>;
 } // HasSVEBFSCALE
 //===----------------------------------------------------------------------===//
 // SME2.1 or SVE2.1 instructions

diff  --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c80e985cd774a..00afcb886df24 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2325,9 +2325,13 @@ multiclass sve_fp_2op_p_zds_bfloat<bits<4> opc, string asm, string Ps,
   def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
 }
 
-class  sve_fp_2op_p_zds_bfscale<bits<4> opc, string asm,  DestructiveInstTypeEnum flags>
-: sve_fp_2op_p_zds<0b00, opc, asm, ZPR16>{
-  let DestructiveInstType = flags;
+multiclass sve_fp_2op_p_zds_bfscale<bits<4> opc, string asm, SDPatternOperator op,
+                                    DestructiveInstTypeEnum flags> {
+  let DestructiveInstType = flags in {
+  def _H : sve_fp_2op_p_zds<0b00, opc, asm, ZPR16>;
+  }
+
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8i16, !cast<Instruction>(NAME # _H)>;
 }
 
 multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {

diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll
index b049cd4844982..3254ee4cb6581 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll
@@ -54,3 +54,15 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <v
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>  } @llvm.aarch64.sve.fscale.x4.nxv8bf16(<vscale x 8 x bfloat> %zdn1, <vscale x 8 x bfloat> %zdn2, <vscale x 8 x bfloat> %zdn3, <vscale x 8 x bfloat> %zdn4, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4)
   ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>  } %res
 }
+
+define <vscale x 8 x bfloat> @bfscale_h(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: bfscale_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfscale z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x i16> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+

diff  --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-bfscale.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-bfscale.ll
new file mode 100644
index 0000000000000..4ea5f0b8dd3b7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-bfscale.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve-bfscale < %s | FileCheck %s
+
+define <vscale x 8 x bfloat> @bfscale_h(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: bfscale_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfscale z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fscale.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x i16> %b)
+  ret <vscale x 8 x bfloat> %out
+}