[clang] [AArch64][SME] Warn when using a streaming builtin from a non-streaming function (PR #74064)

Thu Dec 7 06:00:47 PST 2023

https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/74064

>From 5fc69175acf2cd5f80853fec494a37b2603b498a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 30 Nov 2023 13:42:50 +0000
Subject: [PATCH 1/7] [AArch64][SME] Warn when using a streaming builtin from a
 non-streaming function

This PR adds a warning that's emitted when a non-streaming or
non-streaming-compatible builtin is called in an unsuitable function.

Uses work by Kerry McLaughlin.
---
 clang/include/clang/Basic/CMakeLists.txt      |  6 ++
 .../clang/Basic/DiagnosticSemaKinds.td        |  3 +
 clang/include/clang/Sema/Sema.h               |  1 +
 clang/lib/Sema/SemaChecking.cpp               | 77 +++++++++++++++----
 .../Sema/aarch64-incompat-sm-builtin-calls.c  | 21 +++++
 clang/utils/TableGen/SveEmitter.cpp           | 68 ++++++++++++++++
 clang/utils/TableGen/TableGen.cpp             |  9 +++
 clang/utils/TableGen/TableGenBackends.h       |  1 +
 8 files changed, 173 insertions(+), 13 deletions(-)

diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
index 085e316fcc671..bdd72d1d63c43 100644
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -97,6 +97,12 @@ clang_tablegen(arm_sme_builtin_cg.inc -gen-arm-sme-builtin-codegen
 clang_tablegen(arm_sme_sema_rangechecks.inc -gen-arm-sme-sema-rangechecks
   SOURCE arm_sme.td
   TARGET ClangARMSmeSemaRangeChecks)
+clang_tablegen(arm_sme_streaming_attrs.inc -gen-arm-sme-streaming-attrs
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeStreamingAttrs)
+clang_tablegen(arm_sme_builtins_za_state.inc -gen-arm-sme-builtin-za-state
+  SOURCE arm_sme.td
+  TARGET ClangARMSmeBuiltinsZAState)
 clang_tablegen(arm_cde_builtins.inc -gen-arm-cde-builtin-def
   SOURCE arm_cde.td
   TARGET ClangARMCdeBuiltinsDef)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 6dfb2d7195203..c7036fc881a13 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3151,6 +3151,9 @@ def err_attribute_arm_feature_sve_bits_unsupported : Error<
 def warn_attribute_arm_sm_incompat_builtin : Warning<
   "builtin call has undefined behaviour when called from a %0 function">,
   InGroup<DiagGroup<"undefined-arm-streaming">>;
+def warn_attribute_arm_za_builtin_no_za_state : Warning<
+  "builtin call is not valid when calling from a function without active ZA state">,
+  InGroup<DiagGroup<"undefined-arm-za">>;
 def err_sve_vector_in_non_sve_target : Error<
   "SVE vector type %0 cannot be used in a target without sve">;
 def err_attribute_riscv_rvv_bits_unsupported : Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 26f4e38a46e83..8ad49b3c6f98d 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13843,6 +13843,7 @@ class Sema final {
   bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool ParseSVEImmChecks(CallExpr *TheCall,
                          SmallVector<std::tuple<int, int, int>, 3> &ImmChecks);
+  bool CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                    CallExpr *TheCall);
   bool CheckARMCoprocessorImmediate(const TargetInfo &TI, const Expr *CoprocArg,
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index fc6ee6b2c5ab4..e2d717d4ff2ce 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3046,18 +3046,6 @@ bool Sema::ParseSVEImmChecks(
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 7))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck1_1:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 1))
-        HasError = true;
-      break;
-    case SVETypeFlags::ImmCheck1_3:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 3))
-        HasError = true;
-      break;
-    case SVETypeFlags::ImmCheck1_7:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 7))
-        HasError = true;
-      break;
     case SVETypeFlags::ImmCheckExtract:
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0,
                                       (2048 / ElementSizeInBits) - 1))
@@ -3168,11 +3156,71 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
         << TheCall->getSourceRange() << "streaming compatible";
     return;
   }
+
+  if (FnType == ArmNonStreaming && BuiltinType == ArmStreaming) {
+    S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin)
+        << TheCall->getSourceRange() << "non-streaming";
+  }
+}
+
+static bool hasSMEZAState(const FunctionDecl *FD) {
+  if (FD->hasAttr<ArmNewZAAttr>())
+    return true;
+  if (const auto *T = FD->getType()->getAs<FunctionProtoType>())
+    if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask)
+      return true;
+  return false;
+}
+
+static bool hasSMEZAState(unsigned BuiltinID) {
+  switch (BuiltinID) {
+  default:
+    return false;
+#define GET_SME_BUILTIN_HAS_ZA_STATE
+#include "clang/Basic/arm_sme_builtins_za_state.inc"
+#undef GET_SME_BUILTIN_HAS_ZA_STATE
+  }
+}
+
+bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
+  if (const FunctionDecl *FD = getCurFunctionDecl()) {
+    bool debug = FD->getDeclName().getAsString() == "incompat_sve_sm";
+    std::optional<ArmStreamingType> BuiltinType;
+
+    switch (BuiltinID) {
+    default:
+      break;
+#define GET_SME_STREAMING_ATTRS
+#include "clang/Basic/arm_sme_streaming_attrs.inc"
+#undef GET_SME_STREAMING_ATTRS
+    }
+
+    if (BuiltinType)
+      checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType);
+
+    if (hasSMEZAState(BuiltinID) && !hasSMEZAState(FD))
+      Diag(TheCall->getBeginLoc(),
+           diag::warn_attribute_arm_za_builtin_no_za_state)
+          << TheCall->getSourceRange();
+  }
+
+  // Range check SME intrinsics that take immediate values.
+  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
+
+  switch (BuiltinID) {
+  default:
+    return false;
+#define GET_SME_IMMEDIATE_CHECK
+#include "clang/Basic/arm_sme_sema_rangechecks.inc"
+#undef GET_SME_IMMEDIATE_CHECK
+  }
+
+  return ParseSVEImmChecks(TheCall, ImmChecks);
 }
 
 bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   // Range check SVE intrinsics that take immediate values.
-  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
+  SmallVector<std::tuple<int,int,int>, 3> ImmChecks;
 
   switch (BuiltinID) {
   default:
@@ -3569,6 +3617,9 @@ bool Sema::CheckAArch64BuiltinFunctionCall(const TargetInfo &TI,
   if (CheckSVEBuiltinFunctionCall(BuiltinID, TheCall))
     return true;
 
+  if (CheckSMEBuiltinFunctionCall(BuiltinID, TheCall))
+    return true;
+
   // For intrinsics which take an immediate value as part of the instruction,
   // range check them here.
   unsigned i = 0, l = 0, u = 0;
diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
index e77e09c443518..8f33075c7a9b5 100644
--- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
+++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
@@ -5,6 +5,7 @@
 // REQUIRES: aarch64-registered-target
 
 #include "arm_neon.h"
+#include "arm_sme_draft_spec_subject_to_change.h"
 
 int16x8_t incompat_neon_sm(int16x8_t splat) __arm_streaming {
   // expected-warning at +1 {{builtin call has undefined behaviour when called from a streaming function}}
@@ -20,3 +21,23 @@ int16x8_t incompat_neon_smc(int16x8_t splat) __arm_streaming_compatible {
   // expected-warning at +1 {{builtin call has undefined behaviour when called from a streaming compatible function}}
   return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
 }
+
+void incompat_sme_norm(svbool_t pg, void const *ptr) __arm_shared_za {
+  // expected-warning at +1 {{builtin call has undefined behaviour when called from a non-streaming function}}
+  return __builtin_sme_svld1_hor_za128(0, 0, pg, ptr);
+}
+
+void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible __arm_shared_za {
+  // expected-warning at +1 {{builtin call has undefined behaviour when called from a streaming compatible function}}
+  return __builtin_sme_svld1_hor_za128(0, 0, pg, ptr);
+}
+
+void incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_shared_za {
+  // expected-warning at +1 {{builtin call has undefined behaviour when called from a non-streaming function}}
+  svmops_za32_f32_m(0, pn, pm, zn, zm);
+}
+
+svbool_t streaming_caller_ptrue(void) __arm_streaming {
+  // expected-no-warning
+  return svand_z(svptrue_b16(), svptrue_pat_b16(SV_ALL), svptrue_pat_b16(SV_VL4));
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index b8a1fb4bab0f4..b4809db9b9ec4 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -378,6 +378,9 @@ class SVEEmitter {
   /// Emit all the information needed to map builtin -> LLVM IR intrinsic.
   void createSMECodeGenMap(raw_ostream &o);
 
+  /// Create a table for a builtin's requirement for PSTATE.SM.
+  void createStreamingAttrs(raw_ostream &o, ACLEKind Kind);
+
   /// Emit all the range checks for the immediates.
   void createSMERangeChecks(raw_ostream &o);
 
@@ -1375,6 +1378,12 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
   OS << "#define __aio static __inline__ __attribute__((__always_inline__, "
         "__nodebug__, __overloadable__))\n\n";
 
+  OS << "#ifdef __ARM_FEATURE_SME\n";
+  OS << "#define __asc __attribute__((arm_streaming_compatible))\n";
+  OS << "#else\n";
+  OS << "#define __asc\n";
+  OS << "#endif\n\n";
+
   // Add reinterpret functions.
   for (auto [N, Suffix] :
        std::initializer_list<std::pair<unsigned, const char *>>{
@@ -1694,6 +1703,61 @@ void SVEEmitter::createSMERangeChecks(raw_ostream &OS) {
   OS << "#endif\n\n";
 }
 
+void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) {
+  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+  SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
+  for (auto *R : RV)
+    createIntrinsic(R, Defs);
+
+  // The mappings must be sorted based on BuiltinID.
+  llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
+                      const std::unique_ptr<Intrinsic> &B) {
+    return A->getMangledName() < B->getMangledName();
+  });
+
+  switch (Kind) {
+  case ACLEKind::SME:
+    OS << "#ifdef GET_SME_STREAMING_ATTRS\n";
+    break;
+  case ACLEKind::SVE:
+    OS << "#ifdef GET_SVE_STREAMING_ATTRS\n";
+    break;
+  }
+
+  // Ensure these are only emitted once.
+  std::set<std::string> Emitted;
+
+  uint64_t IsStreamingFlag = getEnumValueForFlag("IsStreaming");
+  uint64_t IsStreamingCompatibleFlag =
+      getEnumValueForFlag("IsStreamingCompatible");
+  for (auto &Def : Defs) {
+    if (Emitted.find(Def->getMangledName()) != Emitted.end())
+      continue;
+
+    switch (Kind) {
+    case ACLEKind::SME:
+      OS << "case SME::BI__builtin_sme_";
+      break;
+    case ACLEKind::SVE:
+      OS << "case SVE::BI__builtin_sve_";
+      break;
+    }
+    OS << Def->getMangledName() << ":\n";
+
+    if (Def->isFlagSet(IsStreamingFlag))
+      OS << "  BuiltinType = ArmStreaming;\n";
+    else if (Def->isFlagSet(IsStreamingCompatibleFlag))
+      OS << "  BuiltinType = ArmStreamingCompatible;\n";
+    else
+      OS << "  BuiltinType = ArmNonStreaming;\n";
+    OS << "  break;\n";
+
+    Emitted.insert(Def->getMangledName());
+  }
+
+  OS << "#endif\n\n";
+}
+
 namespace clang {
 void EmitSveHeader(RecordKeeper &Records, raw_ostream &OS) {
   SVEEmitter(Records).createHeader(OS);
@@ -1730,4 +1794,8 @@ void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) {
 void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) {
   SVEEmitter(Records).createSMERangeChecks(OS);
 }
+
+void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) {
+  SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME);
+}
 } // End namespace clang
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
index 7efb6c731d3e5..9ba2fb07f1380 100644
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -89,6 +89,8 @@ enum ActionType {
   GenArmSmeBuiltins,
   GenArmSmeBuiltinCG,
   GenArmSmeRangeChecks,
+  GenArmSmeStreamingAttrs,
+  GenArmSmeBuiltinZAState,
   GenArmCdeHeader,
   GenArmCdeBuiltinDef,
   GenArmCdeBuiltinSema,
@@ -251,6 +253,10 @@ cl::opt<ActionType> Action(
                    "Generate arm_sme_builtin_cg_map.inc for clang"),
         clEnumValN(GenArmSmeRangeChecks, "gen-arm-sme-sema-rangechecks",
                    "Generate arm_sme_sema_rangechecks.inc for clang"),
+        clEnumValN(GenArmSmeStreamingAttrs, "gen-arm-sme-streaming-attrs",
+                   "Generate arm_sme_streaming_attrs.inc for clang"),
+        clEnumValN(GenArmSmeBuiltinZAState, "gen-arm-sme-builtin-za-state",
+                   "Generate arm_sme_builtins_za_state.inc for clang"),
         clEnumValN(GenArmMveHeader, "gen-arm-mve-header",
                    "Generate arm_mve.h for clang"),
         clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def",
@@ -500,6 +506,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenArmSmeRangeChecks:
     EmitSmeRangeChecks(Records, OS);
     break;
+  case GenArmSmeStreamingAttrs:
+    EmitSmeStreamingAttrs(Records, OS);
+    break;
   case GenArmCdeHeader:
     EmitCdeHeader(Records, OS);
     break;
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index d8f447069376b..2f1c96bfa5964 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -109,6 +109,7 @@ void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
 void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);

>From a0dd6bea30a030210573b176c982f31487de1c3c Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 1 Dec 2023 16:08:21 +0000
Subject: [PATCH 2/7] fixup! adjust tests

---
 .../aarch64-sme-intrinsics/acle_sme_add-i32.c |  16 +-
 .../aarch64-sme-intrinsics/acle_sme_add-i64.c |  16 +-
 .../acle_sme_mopa-za32.c                      |  14 +-
 .../acle_sme_mopa-za64.c                      |  10 +-
 .../acle_sme_mops-za32.c                      |  14 +-
 .../acle_sme_mops-za64.c                      |  10 +-
 .../aarch64-sme-intrinsics/acle_sme_read.c    | 192 +++++++++---------
 .../aarch64-sme-intrinsics/acle_sme_write.c   | 192 +++++++++---------
 .../aarch64-sme-intrinsics/acle_sme_target.c  |  10 +-
 9 files changed, 237 insertions(+), 237 deletions(-)

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
index e444321be41b2..ee6c1c9dd566b 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn);
 }
 
@@ -50,7 +50,7 @@ void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn);
 }
 
@@ -70,7 +70,7 @@ void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn);
 }
 
@@ -90,7 +90,7 @@ void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn);
 }
 
@@ -110,7 +110,7 @@ void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn);
 }
 
@@ -130,7 +130,7 @@ void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
+void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn);
 }
 
@@ -150,7 +150,7 @@ void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn);
 }
 
@@ -170,7 +170,7 @@ void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) {
+void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(3, pn, pm, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
index 4b2f71d607eb6..254ea89d22c50 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn);
 }
 
@@ -50,7 +50,7 @@ void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn);
 }
 
@@ -70,7 +70,7 @@ void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn);
 }
 
@@ -90,7 +90,7 @@ void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn);
 }
 
@@ -110,7 +110,7 @@ void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn);
 }
 
@@ -130,7 +130,7 @@ void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
+void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn);
 }
 
@@ -150,7 +150,7 @@ void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn);
 }
 
@@ -170,7 +170,7 @@ void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) {
+void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(7, pn, pm, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index 88206eec76b97..b90c9be4a6e09 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
+void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -42,7 +42,7 @@ void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
+void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -62,7 +62,7 @@ void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) {
+void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -82,7 +82,7 @@ void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) {
+void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
 
@@ -102,7 +102,7 @@ void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) {
+void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
 
@@ -118,7 +118,7 @@ void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) {
+void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
  SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -134,7 +134,7 @@ void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) {
+void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index 1a1ab1e00d0ec..a56ce4d17f126 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
@@ -50,7 +50,7 @@ void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -70,7 +70,7 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
+void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) {
+void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
  SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -110,7 +110,7 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) {
+void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
index 1633cc6ed32e0..abcf4c2e698d7 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
+void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -42,7 +42,7 @@ void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
+void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -62,7 +62,7 @@ void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) {
+void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -82,7 +82,7 @@ void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) {
+void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
 
@@ -102,7 +102,7 @@ void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) {
+void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
 
@@ -118,7 +118,7 @@ void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) {
+void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
  SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -134,7 +134,7 @@ void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) {
+void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index b17df30c0f851..b26b9e4e51e05 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) {
+void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
@@ -50,7 +50,7 @@ void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) {
+void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -70,7 +70,7 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) {
+void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
   SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) {
+void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
  SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -110,7 +110,7 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) {
+void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
   SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
index d63900129f994..a15599d186a87 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) {
+svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -44,7 +44,7 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) {
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) {
+svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -63,7 +63,7 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) {
+svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
      return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -83,7 +83,7 @@ svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) {
+svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
      uint32_t slice = slice_base + 7;
      return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -102,7 +102,7 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) {
+svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -122,7 +122,7 @@ svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -141,7 +141,7 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -161,7 +161,7 @@ svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -178,7 +178,7 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -196,7 +196,7 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -215,7 +215,7 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -235,7 +235,7 @@ svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -254,7 +254,7 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -274,7 +274,7 @@ svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -293,7 +293,7 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -313,7 +313,7 @@ svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -332,7 +332,7 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -352,7 +352,7 @@ svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -371,7 +371,7 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -391,7 +391,7 @@ svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -410,7 +410,7 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -430,7 +430,7 @@ svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -449,7 +449,7 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -469,7 +469,7 @@ svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -486,7 +486,7 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -502,7 +502,7 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -520,7 +520,7 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -538,7 +538,7 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -556,7 +556,7 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -574,7 +574,7 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -592,7 +592,7 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -610,7 +610,7 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -626,7 +626,7 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -642,7 +642,7 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -660,7 +660,7 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -678,7 +678,7 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -696,7 +696,7 @@ svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -714,7 +714,7 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -732,7 +732,7 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -750,7 +750,7 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -768,7 +768,7 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -786,7 +786,7 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -804,7 +804,7 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -822,7 +822,7 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -840,7 +840,7 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -858,7 +858,7 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -876,7 +876,7 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -894,7 +894,7 @@ svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -910,7 +910,7 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -928,7 +928,7 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -947,7 +947,7 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
      return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -967,7 +967,7 @@ svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
      uint32_t slice = slice_base + 7;
      return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -986,7 +986,7 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1006,7 +1006,7 @@ svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -1025,7 +1025,7 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1045,7 +1045,7 @@ svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -1062,7 +1062,7 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1080,7 +1080,7 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -1099,7 +1099,7 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1119,7 +1119,7 @@ svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -1138,7 +1138,7 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1158,7 +1158,7 @@ svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -1177,7 +1177,7 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1197,7 +1197,7 @@ svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -1216,7 +1216,7 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1236,7 +1236,7 @@ svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -1255,7 +1255,7 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1275,7 +1275,7 @@ svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -1294,7 +1294,7 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1314,7 +1314,7 @@ svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -1333,7 +1333,7 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1353,7 +1353,7 @@ svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -1370,7 +1370,7 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1386,7 +1386,7 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1404,7 +1404,7 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1422,7 +1422,7 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1440,7 +1440,7 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1458,7 +1458,7 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1476,7 +1476,7 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1494,7 +1494,7 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1510,7 +1510,7 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1526,7 +1526,7 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1544,7 +1544,7 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1562,7 +1562,7 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1580,7 +1580,7 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1598,7 +1598,7 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1616,7 +1616,7 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1634,7 +1634,7 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base)  {
+svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1652,7 +1652,7 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1670,7 +1670,7 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1688,7 +1688,7 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1706,7 +1706,7 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base)  {
+svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1724,7 +1724,7 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1742,7 +1742,7 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1760,7 +1760,7 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1778,7 +1778,7 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base)  {
+svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
     return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
index 3f11aa8c5eb2e..2cc338add314b 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -44,7 +44,7 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
    uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -63,7 +63,7 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -83,7 +83,7 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -102,7 +102,7 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -122,7 +122,7 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -141,7 +141,7 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -161,7 +161,7 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -178,7 +178,7 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -196,7 +196,7 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -215,7 +215,7 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -235,7 +235,7 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -254,7 +254,7 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -274,7 +274,7 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -293,7 +293,7 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -313,7 +313,7 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -332,7 +332,7 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -352,7 +352,7 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -371,7 +371,7 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -391,7 +391,7 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
    uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -410,7 +410,7 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -430,7 +430,7 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -449,7 +449,7 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -469,7 +469,7 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -486,7 +486,7 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -502,7 +502,7 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
@@ -520,7 +520,7 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -538,7 +538,7 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
@@ -556,7 +556,7 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -574,7 +574,7 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
@@ -592,7 +592,7 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -610,7 +610,7 @@ void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
@@ -626,7 +626,7 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -642,7 +642,7 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
@@ -660,7 +660,7 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -678,7 +678,7 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
@@ -696,7 +696,7 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -714,7 +714,7 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
@@ -732,7 +732,7 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -750,7 +750,7 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
@@ -768,7 +768,7 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -786,7 +786,7 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
@@ -804,7 +804,7 @@ void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -822,7 +822,7 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
@@ -840,7 +840,7 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -858,7 +858,7 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
@@ -876,7 +876,7 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -894,7 +894,7 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 
@@ -910,7 +910,7 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -928,7 +928,7 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -947,7 +947,7 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -967,7 +967,7 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -986,7 +986,7 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1006,7 +1006,7 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -1025,7 +1025,7 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1045,7 +1045,7 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -1062,7 +1062,7 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1080,7 +1080,7 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -1099,7 +1099,7 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1119,7 +1119,7 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -1138,7 +1138,7 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1158,7 +1158,7 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -1177,7 +1177,7 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1197,7 +1197,7 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -1216,7 +1216,7 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1236,7 +1236,7 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -1255,7 +1255,7 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1275,7 +1275,7 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -1294,7 +1294,7 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1314,7 +1314,7 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -1333,7 +1333,7 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1353,7 +1353,7 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -1370,7 +1370,7 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1386,7 +1386,7 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
+void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
@@ -1404,7 +1404,7 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1422,7 +1422,7 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
+void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1440,7 +1440,7 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1458,7 +1458,7 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
+void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1476,7 +1476,7 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1494,7 +1494,7 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) {
+void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
@@ -1510,7 +1510,7 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1526,7 +1526,7 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
+void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
@@ -1544,7 +1544,7 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1562,7 +1562,7 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
+void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1580,7 +1580,7 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1598,7 +1598,7 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
+void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1616,7 +1616,7 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1634,7 +1634,7 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
+void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
@@ -1652,7 +1652,7 @@ void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1670,7 +1670,7 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
+void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1688,7 +1688,7 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1706,7 +1706,7 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
+void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1724,7 +1724,7 @@ void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1742,7 +1742,7 @@ void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
+void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1760,7 +1760,7 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1778,7 +1778,7 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
+void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
   SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
index 2de6d9f6877f0..812ada00ceb39 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
@@ -6,20 +6,20 @@
 #include <arm_sme_draft_spec_subject_to_change.h>
 
 __attribute__((target("sme")))
-void test_sme(svbool_t pg, void *ptr) {
+void test_sme(svbool_t pg, void *ptr) __arm_streaming {
   svld1_hor_za8(0, 0, pg, ptr);
 }
 
 __attribute__((target("arch=armv8-a+sme")))
-void test_arch_sme(svbool_t pg, void *ptr) {
+void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming {
   svld1_hor_vnum_za32(0, 0, pg, ptr, 0);
 }
 
 __attribute__((target("+sme")))
-void test_plus_sme(svbool_t pg, void *ptr) {
+void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming {
   svst1_ver_za16(0, 0, pg, ptr);
 }
 
-void undefined(svbool_t pg, void *ptr) {
-  svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-error {{'svst1_ver_vnum_za64' needs target feature sme}}
+void undefined(svbool_t pg, void *ptr) __arm_streaming { // expected-error {{function executed in streaming-SVE mode requires 'sme'}}
+  svst1_ver_vnum_za64(0, 0, pg, ptr, 0);
 }

>From 13974b81a5d2c795ce3f37844f16d66284ec3b05 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 5 Dec 2023 11:10:50 +0000
Subject: [PATCH 3/7] fixup: add sme to target test

---
 clang/lib/Sema/SemaChecking.cpp                          | 1 -
 clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c | 5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index e2d717d4ff2ce..1c7174854ec7a 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3184,7 +3184,6 @@ static bool hasSMEZAState(unsigned BuiltinID) {
 
 bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   if (const FunctionDecl *FD = getCurFunctionDecl()) {
-    bool debug = FD->getDeclName().getAsString() == "incompat_sve_sm";
     std::optional<ArmStreamingType> BuiltinType;
 
     switch (BuiltinID) {
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
index 812ada00ceb39..7cfe9fdfbd24f 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
@@ -20,6 +20,7 @@ void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming {
   svst1_ver_za16(0, 0, pg, ptr);
 }
 
-void undefined(svbool_t pg, void *ptr) __arm_streaming { // expected-error {{function executed in streaming-SVE mode requires 'sme'}}
-  svst1_ver_vnum_za64(0, 0, pg, ptr, 0);
+__attribute__((target("+sme")))
+void undefined(svbool_t pg, void *ptr) {
+  svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-warning {{builtin call has undefined behaviour when called from a non-streaming function}}
 }

>From 9d9e13dc2c8fe2bc19b3ce22518ad58d3a96e8d0 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 5 Dec 2023 11:23:53 +0000
Subject: [PATCH 4/7] fixup: use ParseSVEImmChecks in
 CheckSVEBuiltinFunctionCall

---
 clang/lib/Sema/SemaChecking.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 1c7174854ec7a..922c8d94632bb 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3117,6 +3117,18 @@ bool Sema::ParseSVEImmChecks(
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 255))
         HasError = true;
       break;
+    case SVETypeFlags::ImmCheck1_1:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 1))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck1_3:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 3))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck1_7:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 7))
+        HasError = true;
+      break;
     case SVETypeFlags::ImmCheck2_4_Mul2:
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 2, 4) ||
           SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))

>From 27075ede887854d29c54be79f7dfafbbb7b38a2c Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 5 Dec 2023 16:44:35 +0000
Subject: [PATCH 5/7] fixup: add za state tablegen generation

---
 .../aarch64-sme-intrinsics/acle_sme_target.c  |  8 ++--
 clang/utils/TableGen/SveEmitter.cpp           | 41 +++++++++++++++++++
 clang/utils/TableGen/TableGen.cpp             |  3 ++
 clang/utils/TableGen/TableGenBackends.h       |  1 +
 4 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
index 7cfe9fdfbd24f..95bb6be2d2d34 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
@@ -6,21 +6,21 @@
 #include <arm_sme_draft_spec_subject_to_change.h>
 
 __attribute__((target("sme")))
-void test_sme(svbool_t pg, void *ptr) __arm_streaming {
+void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za8(0, 0, pg, ptr);
 }
 
 __attribute__((target("arch=armv8-a+sme")))
-void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming {
+void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za32(0, 0, pg, ptr, 0);
 }
 
 __attribute__((target("+sme")))
-void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming {
+void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_ver_za16(0, 0, pg, ptr);
 }
 
 __attribute__((target("+sme")))
-void undefined(svbool_t pg, void *ptr) {
+void undefined(svbool_t pg, void *ptr) __arm_shared_za {
   svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-warning {{builtin call has undefined behaviour when called from a non-streaming function}}
 }
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index b4809db9b9ec4..7605d343e7786 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -384,6 +384,9 @@ class SVEEmitter {
   /// Emit all the range checks for the immediates.
   void createSMERangeChecks(raw_ostream &o);
 
+  /// Create a table for a builtin's requirement for PSTATE.ZA.
+  void createBuiltinZAState(raw_ostream &OS);
+
   /// Create intrinsic and add it to \p Out
   void createIntrinsic(Record *R,
                        SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out);
@@ -1703,6 +1706,40 @@ void SVEEmitter::createSMERangeChecks(raw_ostream &OS) {
   OS << "#endif\n\n";
 }
 
+void SVEEmitter::createBuiltinZAState(raw_ostream &OS) {
+  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+  SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
+  for (auto *R : RV)
+    createIntrinsic(R, Defs);
+
+  // The mappings must be sorted based on BuiltinID.
+  llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
+                      const std::unique_ptr<Intrinsic> &B) {
+    return A->getMangledName() < B->getMangledName();
+  });
+
+  OS << "#ifdef GET_SME_BUILTIN_HAS_ZA_STATE\n";
+
+  // Ensure these are only emitted once.
+  std::set<std::string> Emitted;
+
+  uint64_t IsSharedZAFlag = getEnumValueForFlag("IsSharedZA");
+  for (auto &Def : Defs) {
+    if (Emitted.find(Def->getMangledName()) != Emitted.end())
+      continue;
+
+    OS << "case SME::BI__builtin_sme_" << Def->getMangledName() << ":\n";
+    if (Def->isFlagSet(IsSharedZAFlag))
+      OS << "  return true;\n";
+    else
+      OS << "  return false;\n";
+
+    Emitted.insert(Def->getMangledName());
+  }
+
+  OS << "#endif\n\n";
+}
+
 void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) {
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
   SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
@@ -1798,4 +1835,8 @@ void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) {
 void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) {
   SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME);
 }
+
+void EmitSmeBuiltinZAState(RecordKeeper &Records, raw_ostream &OS) {
+  SVEEmitter(Records).createBuiltinZAState(OS);
+}
 } // End namespace clang
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
index 9ba2fb07f1380..89e90adf7f334 100644
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -509,6 +509,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenArmSmeStreamingAttrs:
     EmitSmeStreamingAttrs(Records, OS);
     break;
+  case GenArmSmeBuiltinZAState:
+    EmitSmeBuiltinZAState(Records, OS);
+    break;
   case GenArmCdeHeader:
     EmitCdeHeader(Records, OS);
     break;
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index 2f1c96bfa5964..dabbf9d2055a7 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -110,6 +110,7 @@ void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
 void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);

>From 80c0cda9b88a7d14fd6184b9b6ce2a4345b71a79 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 6 Dec 2023 12:45:12 +0000
Subject: [PATCH 6/7] fixup: fix za state tests

---
 clang/lib/Sema/SemaChecking.cpp               |  34 ++--
 .../aarch64-sme-intrinsics/acle_sme_add-i32.c |  16 +-
 .../aarch64-sme-intrinsics/acle_sme_add-i64.c |  16 +-
 .../aarch64-sme-intrinsics/acle_sme_ld1.c     |  20 +-
 .../acle_sme_ld1_vnum.c                       |  20 +-
 .../aarch64-sme-intrinsics/acle_sme_ldr.c     |  10 +-
 .../acle_sme_mopa-za32.c                      |  14 +-
 .../acle_sme_mopa-za64.c                      |  10 +-
 .../acle_sme_mops-za32.c                      |  14 +-
 .../acle_sme_mops-za64.c                      |  10 +-
 .../aarch64-sme-intrinsics/acle_sme_read.c    | 192 +++++++++---------
 .../aarch64-sme-intrinsics/acle_sme_st1.c     |  20 +-
 .../acle_sme_st1_vnum.c                       |  20 +-
 .../aarch64-sme-intrinsics/acle_sme_str.c     |  10 +-
 .../aarch64-sme-intrinsics/acle_sme_write.c   | 192 +++++++++---------
 .../aarch64-sme-intrinsics/acle_sme_zero.c    |   8 +-
 16 files changed, 303 insertions(+), 303 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 922c8d94632bb..c7de8db15247c 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3046,6 +3046,23 @@ bool Sema::ParseSVEImmChecks(
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 7))
         HasError = true;
       break;
+    case SVETypeFlags::ImmCheck1_1:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 1))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck1_3:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 3))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck1_7:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 7))
+        HasError = true;
+      break;
+    case SVETypeFlags::ImmCheck2_4_Mul2:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 2, 4) ||
+          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))
+        HasError = true;
+      break;
     case SVETypeFlags::ImmCheckExtract:
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0,
                                       (2048 / ElementSizeInBits) - 1))
@@ -3117,23 +3134,6 @@ bool Sema::ParseSVEImmChecks(
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 255))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck1_1:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 1))
-        HasError = true;
-      break;
-    case SVETypeFlags::ImmCheck1_3:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 3))
-        HasError = true;
-      break;
-    case SVETypeFlags::ImmCheck1_7:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 7))
-        HasError = true;
-      break;
-    case SVETypeFlags::ImmCheck2_4_Mul2:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 2, 4) ||
-          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))
-        HasError = true;
-      break;
     }
   }
 
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
index ee6c1c9dd566b..55d2e355897f7 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
+void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn);
 }
 
@@ -50,7 +50,7 @@ void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
+void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn);
 }
 
@@ -70,7 +70,7 @@ void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
+void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn);
 }
 
@@ -90,7 +90,7 @@ void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
+void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn);
 }
 
@@ -110,7 +110,7 @@ void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
+void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn);
 }
 
@@ -130,7 +130,7 @@ void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming {
+void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn);
 }
 
@@ -150,7 +150,7 @@ void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
+void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn);
 }
 
@@ -170,7 +170,7 @@ void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming {
+void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(3, pn, pm, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
index 254ea89d22c50..8e9c2e7da46a3 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn);
 }
 
@@ -50,7 +50,7 @@ void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn);
 }
 
@@ -70,7 +70,7 @@ void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn);
 }
 
@@ -90,7 +90,7 @@ void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn);
 }
 
@@ -110,7 +110,7 @@ void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn);
 }
 
@@ -130,7 +130,7 @@ void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming {
+void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn);
 }
 
@@ -150,7 +150,7 @@ void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn);
 }
 
@@ -170,7 +170,7 @@ void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming {
+void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(7, pn, pm, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
index 5622568c4cd76..e17782db222b6 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
@@ -22,7 +22,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za8(0, slice_base, pg, ptr);
   svld1_hor_za8(0, slice_base + 15, pg, ptr);
 }
@@ -45,7 +45,7 @@ void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za16(0, slice_base, pg, ptr);
   svld1_hor_za16(1, slice_base + 7, pg, ptr);
 }
@@ -68,7 +68,7 @@ void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za32(0, slice_base, pg, ptr);
   svld1_hor_za32(3, slice_base + 3, pg, ptr);
 }
@@ -91,7 +91,7 @@ void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za64(0, slice_base, pg, ptr);
   svld1_hor_za64(7, slice_base + 1, pg, ptr);
 }
@@ -112,7 +112,7 @@ void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_hor_za128(0, slice_base, pg, ptr);
   svld1_hor_za128(15, slice_base, pg, ptr);
 }
@@ -133,7 +133,7 @@ void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za8(0, slice_base, pg, ptr);
   svld1_ver_za8(0, slice_base + 15, pg, ptr);
 }
@@ -156,7 +156,7 @@ void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za16(0, slice_base, pg, ptr);
   svld1_ver_za16(1, slice_base + 7, pg, ptr);
 }
@@ -179,7 +179,7 @@ void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za32(0, slice_base, pg, ptr);
   svld1_ver_za32(3, slice_base + 3, pg, ptr);
 }
@@ -202,7 +202,7 @@ void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za64(0, slice_base, pg, ptr);
   svld1_ver_za64(7, slice_base + 1, pg, ptr);
 }
@@ -223,7 +223,7 @@ void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming {
+void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
   svld1_ver_za128(0, slice_base, pg, ptr);
   svld1_ver_za128(15, slice_base, pg, ptr);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
index 0fe7dcfc0a799..0fa77e1144a7d 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
@@ -28,7 +28,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -57,7 +57,7 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -86,7 +86,7 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -115,7 +115,7 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -142,7 +142,7 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
@@ -169,7 +169,7 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -198,7 +198,7 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -227,7 +227,7 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -256,7 +256,7 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -283,7 +283,7 @@ void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming {
+void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svld1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index 9af0778e89c5e..314c9645dd4f7 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -12,7 +12,7 @@
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
+void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, 0);
 }
 
@@ -22,7 +22,7 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
+void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, 15);
 }
 
@@ -32,7 +32,7 @@ void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_za(uint32_t slice_base, const void *ptr) {
+void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_za(slice_base, ptr);
 }
 
@@ -43,7 +43,7 @@ void test_svldr_za(uint32_t slice_base, const void *ptr) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) {
+void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, vnum);
 }
 
@@ -53,6 +53,6 @@ void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) {
+void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_shared_za {
   svldr_vnum_za(slice_base, ptr, 16);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index b90c9be4a6e09..e84f31c2dfa92 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
+void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -42,7 +42,7 @@ void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -62,7 +62,7 @@ void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
+void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -82,7 +82,7 @@ void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
+void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
 
@@ -102,7 +102,7 @@ void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
+void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
 
@@ -118,7 +118,7 @@ void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
  SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -134,7 +134,7 @@ void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
+void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index a56ce4d17f126..1b22eb64e9e36 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
+void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
@@ -50,7 +50,7 @@ void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -70,7 +70,7 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
+void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
  SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -110,7 +110,7 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
+void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
index abcf4c2e698d7..0ff97ff92f714 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming {
+void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -42,7 +42,7 @@ void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -62,7 +62,7 @@ void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming {
+void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -82,7 +82,7 @@ void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming {
+void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
 
@@ -102,7 +102,7 @@ void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming {
+void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
 
@@ -118,7 +118,7 @@ void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming {
+void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
  SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -134,7 +134,7 @@ void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming {
+void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index b26b9e4e51e05..5b190a7f9b748 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming {
+void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
@@ -50,7 +50,7 @@ void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -70,7 +70,7 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming {
+void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming {
+void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
  SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -110,7 +110,7 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming {
+void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
index a15599d186a87..843a96da90278 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -44,7 +44,7 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -63,7 +63,7 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
      return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -83,7 +83,7 @@ svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
      uint32_t slice = slice_base + 7;
      return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -102,7 +102,7 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -122,7 +122,7 @@ svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -141,7 +141,7 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -161,7 +161,7 @@ svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -178,7 +178,7 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -196,7 +196,7 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -215,7 +215,7 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -235,7 +235,7 @@ svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -254,7 +254,7 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -274,7 +274,7 @@ svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -293,7 +293,7 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -313,7 +313,7 @@ svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -332,7 +332,7 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -352,7 +352,7 @@ svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -371,7 +371,7 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -391,7 +391,7 @@ svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -410,7 +410,7 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -430,7 +430,7 @@ svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -449,7 +449,7 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -469,7 +469,7 @@ svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -486,7 +486,7 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -502,7 +502,7 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -520,7 +520,7 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -538,7 +538,7 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -556,7 +556,7 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -574,7 +574,7 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -592,7 +592,7 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -610,7 +610,7 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -626,7 +626,7 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -642,7 +642,7 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -660,7 +660,7 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -678,7 +678,7 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -696,7 +696,7 @@ svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -714,7 +714,7 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -732,7 +732,7 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -750,7 +750,7 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -768,7 +768,7 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -786,7 +786,7 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -804,7 +804,7 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -822,7 +822,7 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -840,7 +840,7 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -858,7 +858,7 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -876,7 +876,7 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -894,7 +894,7 @@ svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -910,7 +910,7 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -928,7 +928,7 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -947,7 +947,7 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
      return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -967,7 +967,7 @@ svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
      uint32_t slice = slice_base + 7;
      return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -986,7 +986,7 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1006,7 +1006,7 @@ svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -1025,7 +1025,7 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1045,7 +1045,7 @@ svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -1062,7 +1062,7 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1080,7 +1080,7 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -1099,7 +1099,7 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1119,7 +1119,7 @@ svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -1138,7 +1138,7 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1158,7 +1158,7 @@ svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -1177,7 +1177,7 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1197,7 +1197,7 @@ svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -1216,7 +1216,7 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1236,7 +1236,7 @@ svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -1255,7 +1255,7 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1275,7 +1275,7 @@ svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -1294,7 +1294,7 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1314,7 +1314,7 @@ svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -1333,7 +1333,7 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1353,7 +1353,7 @@ svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -1370,7 +1370,7 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1386,7 +1386,7 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1404,7 +1404,7 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1422,7 +1422,7 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1440,7 +1440,7 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1458,7 +1458,7 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1476,7 +1476,7 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1494,7 +1494,7 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1510,7 +1510,7 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1526,7 +1526,7 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1544,7 +1544,7 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1562,7 +1562,7 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1580,7 +1580,7 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1598,7 +1598,7 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1616,7 +1616,7 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1634,7 +1634,7 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1652,7 +1652,7 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1670,7 +1670,7 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1688,7 +1688,7 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1706,7 +1706,7 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1724,7 +1724,7 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1742,7 +1742,7 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1760,7 +1760,7 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1778,7 +1778,7 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
     return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
index 9ed158aedf7e3..98ebbefc2e74c 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
@@ -22,7 +22,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_hor_za8(0, slice_base, pg, ptr);
   svst1_hor_za8(0, slice_base + 15, pg, ptr);
 }
@@ -45,7 +45,7 @@ void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_hor_za16(0, slice_base, pg, ptr);
   svst1_hor_za16(1, slice_base + 7, pg, ptr);
 }
@@ -68,7 +68,7 @@ void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_hor_za32(0, slice_base, pg, ptr);
   svst1_hor_za32(3, slice_base + 3, pg, ptr);
 }
@@ -91,7 +91,7 @@ void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_hor_za64(0, slice_base, pg, ptr);
   svst1_hor_za64(7, slice_base + 1, pg, ptr);
 }
@@ -112,7 +112,7 @@ void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_hor_za128(0, slice_base, pg, ptr);
   svst1_hor_za128(15, slice_base, pg, ptr);
 }
@@ -133,7 +133,7 @@ void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_str
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_ver_za8(0, slice_base, pg, ptr);
   svst1_ver_za8(0, slice_base + 15, pg, ptr);
 }
@@ -156,7 +156,7 @@ void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_ver_za16(0, slice_base, pg, ptr);
   svst1_ver_za16(1, slice_base + 7, pg, ptr);
 }
@@ -179,7 +179,7 @@ void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_ver_za32(0, slice_base, pg, ptr);
   svst1_ver_za32(3, slice_base + 3, pg, ptr);
 }
@@ -202,7 +202,7 @@ void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_ver_za64(0, slice_base, pg, ptr);
   svst1_ver_za64(7, slice_base + 1, pg, ptr);
 }
@@ -223,7 +223,7 @@ void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming {
+void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
   svst1_ver_za128(0, slice_base, pg, ptr);
   svst1_ver_za128(15, slice_base, pg, ptr);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
index 627098d9365bd..938e62a15c771 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -28,7 +28,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -57,7 +57,7 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -86,7 +86,7 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -115,7 +115,7 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -142,7 +142,7 @@ void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
@@ -169,7 +169,7 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -198,7 +198,7 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -227,7 +227,7 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -256,7 +256,7 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -283,7 +283,7 @@ void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming {
+void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
   svst1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index baadfc18563a0..bcf368bc8dce4 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -12,7 +12,7 @@
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
+void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_shared_za {
   svstr_vnum_za(slice_base, ptr, 0);
 }
 
@@ -22,7 +22,7 @@ void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) {
+void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_shared_za {
   svstr_vnum_za(slice_base, ptr, 15);
 }
 
@@ -32,7 +32,7 @@ void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_za(uint32_t slice_base, void *ptr) {
+void test_svstr_za(uint32_t slice_base, void *ptr) __arm_shared_za {
   svstr_za(slice_base, ptr);
 }
 
@@ -43,7 +43,7 @@ void test_svstr_za(uint32_t slice_base, void *ptr) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) {
+void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_shared_za {
   svstr_vnum_za(slice_base, ptr, vnum);
 }
 
@@ -53,6 +53,6 @@ void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) {
+void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_shared_za {
   svstr_vnum_za(slice_base, ptr, 16);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
index 2cc338add314b..38c8402c3d0fa 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -44,7 +44,7 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
    uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -63,7 +63,7 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -83,7 +83,7 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -102,7 +102,7 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -122,7 +122,7 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -141,7 +141,7 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -161,7 +161,7 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -178,7 +178,7 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -196,7 +196,7 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -215,7 +215,7 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -235,7 +235,7 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -254,7 +254,7 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -274,7 +274,7 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -293,7 +293,7 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -313,7 +313,7 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -332,7 +332,7 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -352,7 +352,7 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -371,7 +371,7 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -391,7 +391,7 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
    uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -410,7 +410,7 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -430,7 +430,7 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -449,7 +449,7 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -469,7 +469,7 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -486,7 +486,7 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -502,7 +502,7 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
@@ -520,7 +520,7 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -538,7 +538,7 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
@@ -556,7 +556,7 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -574,7 +574,7 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
@@ -592,7 +592,7 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -610,7 +610,7 @@ void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
@@ -626,7 +626,7 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -642,7 +642,7 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
@@ -660,7 +660,7 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -678,7 +678,7 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
@@ -696,7 +696,7 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -714,7 +714,7 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
@@ -732,7 +732,7 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -750,7 +750,7 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
@@ -768,7 +768,7 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -786,7 +786,7 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
@@ -804,7 +804,7 @@ void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -822,7 +822,7 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
@@ -840,7 +840,7 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -858,7 +858,7 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
@@ -876,7 +876,7 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -894,7 +894,7 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 
@@ -910,7 +910,7 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -928,7 +928,7 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -947,7 +947,7 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -967,7 +967,7 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -986,7 +986,7 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1006,7 +1006,7 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -1025,7 +1025,7 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1045,7 +1045,7 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -1062,7 +1062,7 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1080,7 +1080,7 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -1099,7 +1099,7 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1119,7 +1119,7 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -1138,7 +1138,7 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1158,7 +1158,7 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -1177,7 +1177,7 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1197,7 +1197,7 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -1216,7 +1216,7 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1236,7 +1236,7 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -1255,7 +1255,7 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1275,7 +1275,7 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -1294,7 +1294,7 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1314,7 +1314,7 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -1333,7 +1333,7 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1353,7 +1353,7 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -1370,7 +1370,7 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1386,7 +1386,7 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
@@ -1404,7 +1404,7 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1422,7 +1422,7 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1440,7 +1440,7 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1458,7 +1458,7 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1476,7 +1476,7 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1494,7 +1494,7 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming {
+void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
@@ -1510,7 +1510,7 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1526,7 +1526,7 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
@@ -1544,7 +1544,7 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1562,7 +1562,7 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1580,7 +1580,7 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1598,7 +1598,7 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1616,7 +1616,7 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1634,7 +1634,7 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming {
+void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
@@ -1652,7 +1652,7 @@ void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1670,7 +1670,7 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1688,7 +1688,7 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1706,7 +1706,7 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming {
+void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1724,7 +1724,7 @@ void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1742,7 +1742,7 @@ void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming {
+void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1760,7 +1760,7 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1778,7 +1778,7 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming {
+void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
   SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
index 750eead7c705e..8b5bd79984c21 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
@@ -18,7 +18,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 0)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_mask_za() {
+__arm_new_za void test_svzero_mask_za() {
   svzero_mask_za(0);
 }
 
@@ -34,7 +34,7 @@ void test_svzero_mask_za() {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 176)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_mask_za_1() {
+__arm_new_za void test_svzero_mask_za_1() {
   svzero_mask_za(176);
 }
 
@@ -50,7 +50,7 @@ void test_svzero_mask_za_1() {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 255)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_mask_za_2() {
+__arm_new_za void test_svzero_mask_za_2() {
   svzero_mask_za(255);
 }
 
@@ -66,7 +66,7 @@ void test_svzero_mask_za_2() {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 255)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_za() {
+__arm_new_za void test_svzero_za() {
   svzero_za();
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

>From 48ee745a815f0fda41cb1791f91d73db73e4aeba Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 7 Dec 2023 13:57:56 +0000
Subject: [PATCH 7/7] fixup: fix rebase issue

---
 clang/lib/Sema/SemaChecking.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index c7de8db15247c..24a0116ffe67c 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3058,11 +3058,6 @@ bool Sema::ParseSVEImmChecks(
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 1, 7))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck2_4_Mul2:
-      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 2, 4) ||
-          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))
-        HasError = true;
-      break;
     case SVETypeFlags::ImmCheckExtract:
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0,
                                       (2048 / ElementSizeInBits) - 1))
@@ -3134,6 +3129,11 @@ bool Sema::ParseSVEImmChecks(
       if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 0, 255))
         HasError = true;
       break;
+    case SVETypeFlags::ImmCheck2_4_Mul2:
+      if (SemaBuiltinConstantArgRange(TheCall, ArgNum, 2, 4) ||
+          SemaBuiltinConstantArgMultiple(TheCall, ArgNum, 2))
+        HasError = true;
+      break;
     }
   }