[clang] [Clang][AArch64] Require SVE or SSVE for scalable types. (PR #91356)

Tue May 7 09:30:16 PDT 2024

https://github.com/sdesmalen-arm created https://github.com/llvm/llvm-project/pull/91356

Scalable types are only available when:
* The function is compiled with +sve
* The function is compiled with +sme and the function is executed in Streaming-SVE mode.

>From 1cc17981a612dcb31fba86c5d64c444f26a44d38 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 3 May 2024 13:07:18 +0100
Subject: [PATCH] [Clang][AArch64] Require SVE or SSVE for scalable types.

Scalable types are only available when:
* The function is compiled with +sve
* The function is compiled with +sme and the function is executed in Streaming-SVE mode.
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  2 +
 clang/lib/Sema/Sema.cpp                       | 11 ++-
 clang/lib/Sema/SemaDecl.cpp                   | 16 ++--
 .../acle_sme2_reinterpret_svcount_svbool.c    | 10 ++-
 .../aarch64-sve2-intrinsics/acle_sve2_revd.c  | 78 ++++++++++---------
 .../acle_sve2p1_bfadd.c                       | 18 +++--
 .../acle_sve2p1_bfmax.c                       | 18 +++--
 .../acle_sve2p1_bfmaxnm.c                     | 18 +++--
 .../acle_sve2p1_bfmin.c                       | 18 +++--
 .../acle_sve2p1_bfminnm.c                     | 18 +++--
 .../acle_sve2p1_bfmla.c                       | 18 +++--
 .../acle_sve2p1_bfmls.c                       | 18 +++--
 .../acle_sve2p1_bfmul.c                       | 18 +++--
 .../acle_sve2p1_bfsub.c                       | 18 +++--
 .../acle_sve2p1_create2_bool.c                | 20 ++---
 .../acle_sve2p1_get4_bool.c                   | 24 +++---
 .../acle_sve2p1_undef_bool.c                  | 10 ++-
 .../Sema/aarch64-sme2-sve2p1-diagnostics.c    |  2 +
 clang/test/Sema/aarch64-sme2p1-diagnostics.c  |  2 +-
 19 files changed, 212 insertions(+), 125 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9a0bae9c216d..72326d4509cd 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3205,6 +3205,8 @@ def warn_attribute_arm_zt0_builtin_no_zt0_state : Warning<
   InGroup<DiagGroup<"undefined-arm-zt0">>;
 def err_sve_vector_in_non_sve_target : Error<
   "SVE vector type %0 cannot be used in a target without sve">;
+def err_sve_vector_in_non_streaming_function : Error<
+  "SVE vector type %0 cannot be used in a non-streaming function">;
 def err_attribute_riscv_rvv_bits_unsupported : Error<
   "%0 is only supported when '-mrvv-vector-bits=<bits>' is specified with a "
   "value of \"zvl\" or a power 2 in the range [64,65536]">;
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index a1e32d391ed0..92f859b7146c 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -2056,9 +2056,14 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
     if (Ty->isSVESizelessBuiltinType() && FD && FD->hasBody()) {
       llvm::StringMap<bool> CallerFeatureMap;
       Context.getFunctionFeatureMap(CallerFeatureMap, FD);
-      if (!Builtin::evaluateRequiredTargetFeatures("sve", CallerFeatureMap) &&
-          !Builtin::evaluateRequiredTargetFeatures("sme", CallerFeatureMap))
-        Diag(D->getLocation(), diag::err_sve_vector_in_non_sve_target) << Ty;
+      if (!Builtin::evaluateRequiredTargetFeatures("sve", CallerFeatureMap)) {
+        if (!Builtin::evaluateRequiredTargetFeatures("sme", CallerFeatureMap))
+          Diag(D->getLocation(), diag::err_sve_vector_in_non_sve_target) << Ty;
+        else if (!IsArmStreamingFunction(FD, /*IncludeLocallyStreaming=*/true)) {
+          Diag(D->getLocation(), diag::err_sve_vector_in_non_streaming_function)
+              << Ty;
+        }
+      }
     }
   };
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 590f37837eb2..0d11a2acf256 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8982,11 +8982,17 @@ void Sema::CheckVariableDeclarationType(VarDecl *NewVD) {
     const FunctionDecl *FD = cast<FunctionDecl>(CurContext);
     llvm::StringMap<bool> CallerFeatureMap;
     Context.getFunctionFeatureMap(CallerFeatureMap, FD);
-    if (!Builtin::evaluateRequiredTargetFeatures(
-        "sve", CallerFeatureMap)) {
-      Diag(NewVD->getLocation(), diag::err_sve_vector_in_non_sve_target) << T;
-      NewVD->setInvalidDecl();
-      return;
+
+    if (!Builtin::evaluateRequiredTargetFeatures("sve", CallerFeatureMap)) {
+      if (!Builtin::evaluateRequiredTargetFeatures("sme", CallerFeatureMap)) {
+        Diag(NewVD->getLocation(), diag::err_sve_vector_in_non_sve_target) << T;
+        NewVD->setInvalidDecl();
+      } else if (!IsArmStreamingFunction(FD, /*IncludeLocallyStreaming=*/true)) {
+        Diag(NewVD->getLocation(),
+             diag::err_sve_vector_in_non_streaming_function)
+            << T;
+        NewVD->setInvalidDecl();
+      }
     }
   }
 
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
index b3d5f4a4c4a5..c225c5c6c669 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
@@ -9,6 +9,12 @@
 
 #include <arm_sme.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.§
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
@@ -26,7 +32,7 @@
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[CNT:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
 //
-svbool_t test_svreinterpret_svbool_svcnt(svcount_t cnt) __arm_streaming_compatible
+svbool_t test_svreinterpret_svbool_svcnt(svcount_t cnt) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svreinterpret,_b,,)(cnt);
 }
@@ -41,7 +47,7 @@ svbool_t test_svreinterpret_svbool_svcnt(svcount_t cnt) __arm_streaming_compatib
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt(<vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    ret target("aarch64.svcount") [[TMP0]]
 //
-svcount_t test_svreinterpret_svcnt_svbool(svbool_t pg) __arm_streaming_compatible
+svcount_t test_svreinterpret_svcnt_svbool(svbool_t pg) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svreinterpret,_c,,)(pg);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
index 128a7eb102da..d2a4e1669a1c 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_revd.c
@@ -12,6 +12,12 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
@@ -29,7 +35,7 @@
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svrevd_s8_z(svbool_t pg, svint8_t op) {
+svint8_t test_svrevd_s8_z(svbool_t pg, svint8_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s8, _z, )(pg, op);
 }
 
@@ -45,7 +51,7 @@ svint8_t test_svrevd_s8_z(svbool_t pg, svint8_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svrevd_s16_z(svbool_t pg, svint16_t op) {
+svint16_t test_svrevd_s16_z(svbool_t pg, svint16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s16, _z, )(pg, op);
 }
 
@@ -61,7 +67,7 @@ svint16_t test_svrevd_s16_z(svbool_t pg, svint16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svrevd_s32_z(svbool_t pg, svint32_t op) {
+svint32_t test_svrevd_s32_z(svbool_t pg, svint32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s32, _z, )(pg, op);
 }
 
@@ -77,7 +83,7 @@ svint32_t test_svrevd_s32_z(svbool_t pg, svint32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svrevd_s64_z(svbool_t pg, svint64_t op) {
+svint64_t test_svrevd_s64_z(svbool_t pg, svint64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s64, _z, )(pg, op);
 }
 
@@ -91,7 +97,7 @@ svint64_t test_svrevd_s64_z(svbool_t pg, svint64_t op) {
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svrevd_u8_z(svbool_t pg, svuint8_t op) {
+svuint8_t test_svrevd_u8_z(svbool_t pg, svuint8_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u8, _z, )(pg, op);
 }
 // CHECK-LABEL: @test_svrevd_u16_z(
@@ -106,7 +112,7 @@ svuint8_t test_svrevd_u8_z(svbool_t pg, svuint8_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svrevd_u16_z(svbool_t pg, svuint16_t op) {
+svuint16_t test_svrevd_u16_z(svbool_t pg, svuint16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u16, _z, )(pg, op);
 }
 
@@ -122,7 +128,7 @@ svuint16_t test_svrevd_u16_z(svbool_t pg, svuint16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svrevd_u32_z(svbool_t pg, svuint32_t op) {
+svuint32_t test_svrevd_u32_z(svbool_t pg, svuint32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u32, _z, )(pg, op);
 }
 
@@ -138,7 +144,7 @@ svuint32_t test_svrevd_u32_z(svbool_t pg, svuint32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svrevd_u64_z(svbool_t pg, svuint64_t op) {
+svuint64_t test_svrevd_u64_z(svbool_t pg, svuint64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u64, _z, )(pg, op);
 }
 
@@ -152,7 +158,7 @@ svuint64_t test_svrevd_u64_z(svbool_t pg, svuint64_t op) {
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8> [[INACTIVE:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svrevd_s8_m(svint8_t inactive, svbool_t pg, svint8_t op) {
+svint8_t test_svrevd_s8_m(svint8_t inactive, svbool_t pg, svint8_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s8, _m, )(inactive, pg, op);
 }
 
@@ -168,7 +174,7 @@ svint8_t test_svrevd_s8_m(svint8_t inactive, svbool_t pg, svint8_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16> [[INACTIVE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svrevd_s16_m(svint16_t inactive, svbool_t pg, svint16_t op) {
+svint16_t test_svrevd_s16_m(svint16_t inactive, svbool_t pg, svint16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s16, _m, )(inactive, pg, op);
 }
 
@@ -184,7 +190,7 @@ svint16_t test_svrevd_s16_m(svint16_t inactive, svbool_t pg, svint16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32> [[INACTIVE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svrevd_s32_m(svint32_t inactive, svbool_t pg, svint32_t op) {
+svint32_t test_svrevd_s32_m(svint32_t inactive, svbool_t pg, svint32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s32, _m, )(inactive, pg, op);
 }
 
@@ -200,7 +206,7 @@ svint32_t test_svrevd_s32_m(svint32_t inactive, svbool_t pg, svint32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64> [[INACTIVE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svrevd_s64_m(svint64_t inactive, svbool_t pg, svint64_t op) {
+svint64_t test_svrevd_s64_m(svint64_t inactive, svbool_t pg, svint64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s64, _m, )(inactive, pg, op);
 }
 
@@ -214,7 +220,7 @@ svint64_t test_svrevd_s64_m(svint64_t inactive, svbool_t pg, svint64_t op) {
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8> [[INACTIVE:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svrevd_u8_m(svuint8_t inactive, svbool_t pg, svuint8_t op) {
+svuint8_t test_svrevd_u8_m(svuint8_t inactive, svbool_t pg, svuint8_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u8, _m, )(inactive, pg, op);
 }
 
@@ -230,7 +236,7 @@ svuint8_t test_svrevd_u8_m(svuint8_t inactive, svbool_t pg, svuint8_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16> [[INACTIVE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svrevd_u16_m(svuint16_t inactive, svbool_t pg, svuint16_t op) {
+svuint16_t test_svrevd_u16_m(svuint16_t inactive, svbool_t pg, svuint16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u16, _m, )(inactive, pg, op);
 }
 
@@ -246,7 +252,7 @@ svuint16_t test_svrevd_u16_m(svuint16_t inactive, svbool_t pg, svuint16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32> [[INACTIVE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svrevd_u32_m(svuint32_t inactive, svbool_t pg, svuint32_t op) {
+svuint32_t test_svrevd_u32_m(svuint32_t inactive, svbool_t pg, svuint32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u32, _m, )(inactive, pg, op);
 }
 
@@ -262,7 +268,7 @@ svuint32_t test_svrevd_u32_m(svuint32_t inactive, svbool_t pg, svuint32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64> [[INACTIVE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svrevd_u64_m(svuint64_t inactive, svbool_t pg, svuint64_t op) {
+svuint64_t test_svrevd_u64_m(svuint64_t inactive, svbool_t pg, svuint64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u64, _m, )(inactive, pg, op);
 }
 
@@ -276,7 +282,7 @@ svuint64_t test_svrevd_u64_m(svuint64_t inactive, svbool_t pg, svuint64_t op) {
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svrevd_s8_x(svbool_t pg, svint8_t op) {
+svint8_t test_svrevd_s8_x(svbool_t pg, svint8_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s8, _x, )(pg, op);
 }
 
@@ -292,7 +298,7 @@ svint8_t test_svrevd_s8_x(svbool_t pg, svint8_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svrevd_s16_x(svbool_t pg, svint16_t op) {
+svint16_t test_svrevd_s16_x(svbool_t pg, svint16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s16, _x, )(pg, op);
 }
 
@@ -308,7 +314,7 @@ svint16_t test_svrevd_s16_x(svbool_t pg, svint16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svrevd_s32_x(svbool_t pg, svint32_t op) {
+svint32_t test_svrevd_s32_x(svbool_t pg, svint32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s32, _x, )(pg, op);
 }
 
@@ -324,7 +330,7 @@ svint32_t test_svrevd_s32_x(svbool_t pg, svint32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svrevd_s64_x(svbool_t pg, svint64_t op) {
+svint64_t test_svrevd_s64_x(svbool_t pg, svint64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _s64, _x, )(pg, op);
 }
 
@@ -338,7 +344,7 @@ svint64_t test_svrevd_s64_x(svbool_t pg, svint64_t op) {
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svrevd_u8_x(svbool_t pg, svuint8_t op) {
+svuint8_t test_svrevd_u8_x(svbool_t pg, svuint8_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u8, _x, )(pg, op);
 }
 
@@ -354,7 +360,7 @@ svuint8_t test_svrevd_u8_x(svbool_t pg, svuint8_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svrevd_u16_x(svbool_t pg, svuint16_t op) {
+svuint16_t test_svrevd_u16_x(svbool_t pg, svuint16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u16, _x, )(pg, op);
 }
 
@@ -370,7 +376,7 @@ svuint16_t test_svrevd_u16_x(svbool_t pg, svuint16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svrevd_u32_x(svbool_t pg, svuint32_t op) {
+svuint32_t test_svrevd_u32_x(svbool_t pg, svuint32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u32, _x, )(pg, op);
 }
 
@@ -386,7 +392,7 @@ svuint32_t test_svrevd_u32_x(svbool_t pg, svuint32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svrevd_u64_x(svbool_t pg, svuint64_t op) {
+svuint64_t test_svrevd_u64_x(svbool_t pg, svuint64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _u64, _x, )(pg, op);
 }
 
@@ -403,7 +409,7 @@ svuint64_t test_svrevd_u64_x(svbool_t pg, svuint64_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.revd.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svrevd_bf16_z(svbool_t pg, svbfloat16_t op) {
+svbfloat16_t test_svrevd_bf16_z(svbool_t pg, svbfloat16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _bf16, _z, )(pg, op);
 }
 
@@ -419,7 +425,7 @@ svbfloat16_t test_svrevd_bf16_z(svbool_t pg, svbfloat16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.revd.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svrevd_f16_z(svbool_t pg, svfloat16_t op) {
+svfloat16_t test_svrevd_f16_z(svbool_t pg, svfloat16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f16, _z, )(pg, op);
 }
 
@@ -435,7 +441,7 @@ svfloat16_t test_svrevd_f16_z(svbool_t pg, svfloat16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.revd.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svrevd_f32_z(svbool_t pg, svfloat32_t op) {
+svfloat32_t test_svrevd_f32_z(svbool_t pg, svfloat32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f32, _z, )(pg, op);
 }
 
@@ -451,7 +457,7 @@ svfloat32_t test_svrevd_f32_z(svbool_t pg, svfloat32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.revd.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svrevd_f64_z(svbool_t pg, svfloat64_t op) {
+svfloat64_t test_svrevd_f64_z(svbool_t pg, svfloat64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f64, _z, )(pg, op);
 }
 
@@ -467,7 +473,7 @@ svfloat64_t test_svrevd_f64_z(svbool_t pg, svfloat64_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.revd.nxv8bf16(<vscale x 8 x bfloat> [[INACTIVE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svrevd_bf16_m(svbfloat16_t inactive, svbool_t pg, svbfloat16_t op) {
+svbfloat16_t test_svrevd_bf16_m(svbfloat16_t inactive, svbool_t pg, svbfloat16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _bf16, _m, )(inactive, pg, op);
 }
 
@@ -483,7 +489,7 @@ svbfloat16_t test_svrevd_bf16_m(svbfloat16_t inactive, svbool_t pg, svbfloat16_t
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.revd.nxv8f16(<vscale x 8 x half> [[INACTIVE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svrevd_f16_m(svfloat16_t inactive, svbool_t pg, svfloat16_t op) {
+svfloat16_t test_svrevd_f16_m(svfloat16_t inactive, svbool_t pg, svfloat16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f16, _m, )(inactive, pg, op);
 }
 
@@ -499,7 +505,7 @@ svfloat16_t test_svrevd_f16_m(svfloat16_t inactive, svbool_t pg, svfloat16_t op)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.revd.nxv4f32(<vscale x 4 x float> [[INACTIVE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svrevd_f32_m(svfloat32_t inactive, svbool_t pg, svfloat32_t op) {
+svfloat32_t test_svrevd_f32_m(svfloat32_t inactive, svbool_t pg, svfloat32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f32, _m, )(inactive, pg, op);
 }
 
@@ -515,7 +521,7 @@ svfloat32_t test_svrevd_f32_m(svfloat32_t inactive, svbool_t pg, svfloat32_t op)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.revd.nxv2f64(<vscale x 2 x double> [[INACTIVE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svrevd_f64_m(svfloat64_t inactive, svbool_t pg, svfloat64_t op) {
+svfloat64_t test_svrevd_f64_m(svfloat64_t inactive, svbool_t pg, svfloat64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f64, _m, )(inactive, pg, op);
 }
 
@@ -531,7 +537,7 @@ svfloat64_t test_svrevd_f64_m(svfloat64_t inactive, svbool_t pg, svfloat64_t op)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.revd.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svrevd_bf16_x(svbool_t pg, svbfloat16_t op) {
+svbfloat16_t test_svrevd_bf16_x(svbool_t pg, svbfloat16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _bf16, _x, )(pg, op);
 }
 
@@ -547,7 +553,7 @@ svbfloat16_t test_svrevd_bf16_x(svbool_t pg, svbfloat16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.revd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svrevd_f16_x(svbool_t pg, svfloat16_t op) {
+svfloat16_t test_svrevd_f16_x(svbool_t pg, svfloat16_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f16, _x, )(pg, op);
 }
 
@@ -563,7 +569,7 @@ svfloat16_t test_svrevd_f16_x(svbool_t pg, svfloat16_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.revd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svrevd_f32_x(svbool_t pg, svfloat32_t op) {
+svfloat32_t test_svrevd_f32_x(svbool_t pg, svfloat32_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f32, _x, )(pg, op);
 }
 
@@ -579,6 +585,6 @@ svfloat32_t test_svrevd_f32_x(svbool_t pg, svfloat32_t op) {
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.revd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svrevd_f64_x(svbool_t pg, svfloat64_t op) {
+svfloat64_t test_svrevd_f64_x(svbool_t pg, svfloat64_t op) MODE_ATTR {
   return SVE_ACLE_FUNC(svrevd, _f64, _x, )(pg, op);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
index 9d01ba773694..61b088b08338 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfadd.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svadd_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svadd_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svadd, _bf16, _m)(pg, op1, op2);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svadd_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svadd_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svadd_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svadd, _bf16, _z)(pg, op1, op2);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svadd_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svadd_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svadd_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svadd, _bf16, _x)(pg, op1, op2);
 }
@@ -84,7 +90,7 @@ svbfloat16_t test_svadd_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svadd_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svadd_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svadd, _n_bf16, _m)(pg, op1, op2);
 }
@@ -107,7 +113,7 @@ svbfloat16_t test_svadd_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svadd_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svadd_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svadd, _n_bf16, _z)(pg, op1, op2);
 }
@@ -128,7 +134,7 @@ svbfloat16_t test_svadd_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fadd.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svadd_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svadd_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svadd, _n_bf16, _x)(pg, op1, op2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
index ce28a0b0653d..36d20bdf745d 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmax.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmax_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmax_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmax, _bf16, _m)(pg, op1, op2);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svmax_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmax_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmax_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmax, _bf16, _z)(pg, op1, op2);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svmax_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmax_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmax_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmax, _bf16, _x)(pg, op1, op2);
 }
@@ -85,7 +91,7 @@ svbfloat16_t test_svmax_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmax_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmax_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmax, _n_bf16, _m)(pg, op1, op2);
 }
@@ -108,7 +114,7 @@ svbfloat16_t test_svmax_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmax_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmax_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmax, _n_bf16, _z)(pg, op1, op2);
 }
@@ -129,7 +135,7 @@ svbfloat16_t test_svmax_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmax_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmax_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmax, _n_bf16, _x)(pg, op1, op2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
index d8fec8c5145e..d89cd777de1e 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmaxnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmaxnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmaxnm, _bf16, _m)(pg, op1, op2);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svmaxnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmaxnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmaxnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmaxnm, _bf16, _z)(pg, op1, op2);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svmaxnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmaxnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmaxnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmaxnm, _bf16, _x)(pg, op1, op2);
 }
@@ -85,7 +91,7 @@ svbfloat16_t test_svmaxnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmaxnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmaxnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _m)(pg, op1, op2);
 }
@@ -108,7 +114,7 @@ svbfloat16_t test_svmaxnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmaxnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmaxnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _z)(pg, op1, op2);
 }
@@ -129,7 +135,7 @@ svbfloat16_t test_svmaxnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmaxnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmaxnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmaxnm, _n_bf16, _x)(pg, op1, op2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
index 5efaa37a1464..8bc88572ebc5 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmin.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmin_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmin_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmin, _bf16, _m)(pg, op1, op2);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svmin_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmin_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmin_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmin, _bf16, _z)(pg, op1, op2);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svmin_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmin_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmin_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmin, _bf16, _x)(pg, op1, op2);
 }
@@ -85,7 +91,7 @@ svbfloat16_t test_svmin_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmin_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmin_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmin, _n_bf16, _m)(pg, op1, op2);
 }
@@ -108,7 +114,7 @@ svbfloat16_t test_svmin_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmin_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmin_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmin, _n_bf16, _z)(pg, op1, op2);
 }
@@ -129,7 +135,7 @@ svbfloat16_t test_svmin_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmin_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmin_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmin, _n_bf16, _x)(pg, op1, op2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
index 98fd12b3a839..d90b38262624 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfminnm.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svminnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svminnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svminnm, _bf16, _m)(pg, op1, op2);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svminnm_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svminnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svminnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svminnm, _bf16, _z)(pg, op1, op2);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svminnm_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svminnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svminnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svminnm, _bf16, _x)(pg, op1, op2);
 }
@@ -85,7 +91,7 @@ svbfloat16_t test_svminnm_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svminnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svminnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svminnm, _n_bf16, _m)(pg, op1, op2);
 }
@@ -108,7 +114,7 @@ svbfloat16_t test_svminnm_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svminnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svminnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svminnm, _n_bf16, _z)(pg, op1, op2);
 }
@@ -129,7 +135,7 @@ svbfloat16_t test_svminnm_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svminnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svminnm_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svminnm, _n_bf16, _x)(pg, op1, op2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
index 0a70466a540d..63c7e3e52ce1 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmla.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmla_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmla_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmla, _bf16, _m)(pg, op1, op2, op3);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svmla_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2,
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmla_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmla_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmla, _bf16, _z)(pg, op1, op2, op3);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svmla_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2,
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmla_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmla_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmla, _bf16, _x)(pg, op1, op2, op3);
 }
@@ -84,7 +90,7 @@ svbfloat16_t test_svmla_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2,
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmla_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmla_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmla, _n_bf16, _m)(pg, op1, op2, op3);
 }
@@ -107,7 +113,7 @@ svbfloat16_t test_svmla_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmla_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmla_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmla, _n_bf16, _z)(pg, op1, op2, op3);
 }
@@ -128,7 +134,7 @@ svbfloat16_t test_svmla_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmla.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmla_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmla_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmla, _n_bf16, _x)(pg, op1, op2, op3);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
index ed71d4e490bb..4898a8765f38 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmls.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmls_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmls_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmls, _bf16, _m)(pg, op1, op2, op3);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svmls_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2,
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmls_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmls_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmls, _bf16, _z)(pg, op1, op2, op3);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svmls_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2,
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmls_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmls_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, svbfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmls, _bf16, _x)(pg, op1, op2, op3);
 }
@@ -84,7 +90,7 @@ svbfloat16_t test_svmls_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2,
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmls_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmls_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmls, _n_bf16, _m)(pg, op1, op2, op3);
 }
@@ -107,7 +113,7 @@ svbfloat16_t test_svmls_n_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmls_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmls_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmls, _n_bf16, _z)(pg, op1, op2, op3);
 }
@@ -128,7 +134,7 @@ svbfloat16_t test_svmls_n_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmls.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmls_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) __arm_streaming_compatible
+svbfloat16_t test_svmls_n_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2, bfloat16_t op3) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmls, _n_bf16, _x)(pg, op1, op2, op3);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
index f0a3664426de..904d74303f74 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmul.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmul_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmul_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmul, _bf16, _m)(pg, op1, op2);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svmul_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmul_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmul_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmul, _bf16, _z)(pg, op1, op2);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svmul_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmul_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmul_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmul, _bf16, _x)(pg, op1, op2);
 }
@@ -85,7 +91,7 @@ svbfloat16_t test_svmul_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmul_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmul_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svmul, _n_bf16, _m)(pg, op1, op2);
 }
@@ -108,7 +114,7 @@ svbfloat16_t test_svmul_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svmul_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmul_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmul, _n_bf16, _z)(pg, op1, op2);
 }
@@ -129,7 +135,7 @@ svbfloat16_t test_svmul_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmul.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svmul_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svmul_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svmul, _n_bf16, _x)(pg, op1, op2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
index 6f08ea84ab82..82199f647b60 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfsub.c
@@ -8,6 +8,12 @@
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -27,7 +33,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svsub_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svsub_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svsub, _bf16, _m)(pg, op1, op2);
 }
@@ -46,7 +52,7 @@ svbfloat16_t test_svsub_bf16_m(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svsub_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svsub_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svsub, _bf16, _z)(pg, op1, op2);
 }
@@ -63,7 +69,7 @@ svbfloat16_t test_svsub_bf16_z(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svsub_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svsub_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svsub, _bf16, _x)(pg, op1, op2);
 }
@@ -85,7 +91,7 @@ svbfloat16_t test_svsub_bf16_x(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svsub_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svsub_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svsub, _n_bf16, _m)(pg, op1, op2);
 }
@@ -108,7 +114,7 @@ svbfloat16_t test_svsub_bf16_n_m(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
 //
-svbfloat16_t test_svsub_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svsub_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svsub, _n_bf16, _z)(pg, op1, op2);
 }
@@ -129,7 +135,7 @@ svbfloat16_t test_svsub_bf16_n_z(svbool_t pg, svbfloat16_t op1, bfloat16_t op2)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fsub.u.nxv8bf16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[DOTSPLAT]])
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svsub_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) __arm_streaming_compatible
+svbfloat16_t test_svsub_bf16_n_x(svbool_t pg, svbfloat16_t op1, bfloat16_t op2) MODE_ATTR
 {
    return SVE_ACLE_FUNC(svsub, _n_bf16, _x)(pg, op1, op2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c
index d441c934bad0..b5eb220d7776 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create2_bool.c
@@ -5,19 +5,25 @@
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
@@ -25,12 +31,6 @@
 #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
 #endif
 
-#ifndef TEST_SME2
-#define ATTR
-#else
-#define ATTR __arm_streaming
-#endif
-
 // CHECK-LABEL: @test_svcreate2_b(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> [[X0:%.*]], i64 0)
@@ -43,7 +43,7 @@
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> [[TMP0]], <vscale x 16 x i1> [[X1:%.*]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i1> [[TMP1]]
 //
-svboolx2_t test_svcreate2_b(svbool_t x0, svbool_t x1) ATTR
+svboolx2_t test_svcreate2_b(svbool_t x0, svbool_t x1) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svcreate2,_b,,)(x0, x1);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c
index e5016d6cb3dc..e3530c142c83 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c
@@ -5,19 +5,25 @@
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \
 // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
 #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
@@ -25,12 +31,6 @@
 #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
 #endif
 
-#ifndef TEST_SME
-#define ATTR
-#else
-#define ATTR __arm_streaming
-#endif
-
 // NOTE: For these tests clang converts the struct parameter into
 // several parameters, one for each member of the original struct.
 // CHECK-LABEL: @test_svget4_b_0(
@@ -43,7 +43,7 @@
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 0)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
 //
-svbool_t test_svget4_b_0(svboolx4_t tuple) ATTR
+svbool_t test_svget4_b_0(svboolx4_t tuple) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svget4,_b,,)(tuple, 0);
 }
@@ -60,7 +60,7 @@ svbool_t test_svget4_b_0(svboolx4_t tuple) ATTR
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
 //
-svbool_t test_svget4_b_1(svboolx4_t tuple) ATTR
+svbool_t test_svget4_b_1(svboolx4_t tuple) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svget4,_b,,)(tuple, 1);
 }
@@ -77,7 +77,7 @@ svbool_t test_svget4_b_1(svboolx4_t tuple) ATTR
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[TUPLE:%.*]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
 //
-svbool_t test_svget4_b_3(svboolx4_t tuple) ATTR
+svbool_t test_svget4_b_3(svboolx4_t tuple) MODE_ATTR
 {
   return SVE_ACLE_FUNC(svget4,_b,,)(tuple, 3);
 }
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c
index 5197e41c1ffd..01363681dad2 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_undef_bool.c
@@ -9,6 +9,12 @@
 
 #include <arm_sve.h>
 
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR __arm_streaming_compatible
+#endif
+
 // CHECK-LABEL: define dso_local <vscale x 32 x i1> @test_svundef2_b(
 // CPP-CHECK-LABEL: define dso_local <vscale x 32 x i1> @_Z15test_svundef2_bv(
 //
@@ -18,7 +24,7 @@
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i1> undef
 //
-svboolx2_t test_svundef2_b() {
+svboolx2_t test_svundef2_b(void) MODE_ATTR {
     return svundef2_b();
 }
 
@@ -31,6 +37,6 @@ svboolx2_t test_svundef2_b() {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i1> undef
 //
-svboolx4_t test_svundef4_b() {
+svboolx4_t test_svundef4_b(void) MODE_ATTR {
     return svundef4_b();
 }
diff --git a/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c b/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c
index 4debc14190aa..2012221b4804 100644
--- a/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c
+++ b/clang/test/Sema/aarch64-sme2-sve2p1-diagnostics.c
@@ -6,6 +6,8 @@
 
 //svldnt1:
 
+// expected-error at +3 {{SVE vector type 'svcount_t' (aka '__SVCount_t') cannot be used in a non-streaming function}}
+// expected-error at +2 {{SVE vector type 'svuint8x2_t' (aka '__clang_svuint8x2_t') cannot be used in a non-streaming function}}
 __attribute__((target("+sme2")))
 svuint8x2_t sme2_or_sve2p1_intrinsic_test_sme2_invalid(svcount_t png, const uint8_t *rn) {
   // expected-warning at +1 {{builtin call has undefined behaviour when called from a non-streaming function}}
diff --git a/clang/test/Sema/aarch64-sme2p1-diagnostics.c b/clang/test/Sema/aarch64-sme2p1-diagnostics.c
index a0adb0403858..c3c8b4ede305 100644
--- a/clang/test/Sema/aarch64-sme2p1-diagnostics.c
+++ b/clang/test/Sema/aarch64-sme2p1-diagnostics.c
@@ -3,7 +3,7 @@
 // REQUIRES: aarch64-registered-target
 #include "arm_sme.h"
 
-svuint8x2_t  test_sme2p1(svuint8x2_t  x) {
+svuint8x2_t  test_sme2p1(svuint8x2_t  x) __arm_streaming {
   // expected-no-diagnostics
   return x;
 }