[clang] 8e7f073 - [Clang][AArch64] Change SME attributes for shared/new/preserved state. (#76971)

via cfe-commits cfe-commits at lists.llvm.org
Mon Jan 15 01:41:38 PST 2024


Author: Sander de Smalen
Date: 2024-01-15T09:41:32Z
New Revision: 8e7f073eb42c92aa7a2b651ca314d7fcebf296e3

URL: https://github.com/llvm/llvm-project/commit/8e7f073eb42c92aa7a2b651ca314d7fcebf296e3
DIFF: https://github.com/llvm/llvm-project/commit/8e7f073eb42c92aa7a2b651ca314d7fcebf296e3.diff

LOG: [Clang][AArch64] Change SME attributes for shared/new/preserved state. (#76971)

This patch replaces the `__arm_new_za`, `__arm_shared_za` and
`__arm_preserves_za` attributes with:
* `__arm_new("za")`
* `__arm_in("za")`
* `__arm_out("za")`
* `__arm_inout("za")`
* `__arm_preserves("za")`

As described in https://github.com/ARM-software/acle/pull/276.

One change in behaviour is that `__arm_in(S)`, `__arm_out(S)`, `__arm_inout(S)` and
`__arm_preserves(S)` are mutually exclusive for the same state S, whereas previously it
was valid to combine `__arm_shared_za` with `__arm_preserves_za`. That combination is
now expressed as `__arm_in("za")`.
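
As a minimal before/after sketch of the source-level migration (mirroring the
updates to clang/test/AST/ast-dump-sme-attributes.cpp in this patch; the
function names are only illustrative):

  // Before this patch:
  void f_shared_za() __arm_shared_za;          // shares ZA with the caller
  void f_preserves_za() __arm_preserves_za;    // does not modify ZA
  __arm_new_za void f_new_za();                // set up with a fresh ZA context

  // After this patch:
  void f_shared_za() __arm_inout("za");        // takes ZA as input, returns new ZA contents
  void f_preserves_za() __arm_preserves("za"); // leaves ZA unchanged (and does not read it)
  __arm_new("za") void f_new_za();             // creates a new ZA scope for the function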

The current implementation uses the same LLVM attributes under the hood:
`__arm_in`, `__arm_out` and `__arm_inout` are all variations of "shared ZA",
so they can reuse the existing `aarch64_pstate_za_shared` attribute in LLVM.
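
As a rough sketch of the lowering (based on the CGCall.cpp and CodeGenModule.cpp
changes below; the declarations here are hypothetical examples), the keywords map
onto the existing LLVM IR function attributes as follows:

  void f_in(void) __arm_in("za");               // "aarch64_pstate_za_shared" "aarch64_pstate_za_preserved"
  void f_out(void) __arm_out("za");             // "aarch64_pstate_za_shared"
  void f_inout(void) __arm_inout("za");         // "aarch64_pstate_za_shared"
  void f_preserves(void) __arm_preserves("za"); // "aarch64_pstate_za_shared" "aarch64_pstate_za_preserved"
  __arm_new("za") void f_new(void) {}           // "aarch64_pstate_za_new" on the definition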

#77941 will add support for the new "zt0" state as introduced
with SME2.

Added: 
    

Modified: 
    clang/include/clang/AST/Type.h
    clang/include/clang/Basic/Attr.td
    clang/include/clang/Basic/AttrDocs.td
    clang/include/clang/Basic/AttributeCommonInfo.h
    clang/include/clang/Basic/CMakeLists.txt
    clang/include/clang/Basic/DiagnosticSemaKinds.td
    clang/include/clang/Basic/TokenKinds.def
    clang/include/clang/Basic/TokenKinds.h
    clang/include/clang/Sema/Sema.h
    clang/lib/AST/TypePrinter.cpp
    clang/lib/CodeGen/CGCall.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/lib/Parse/ParseDecl.cpp
    clang/lib/Parse/ParseDeclCXX.cpp
    clang/lib/Parse/ParseTentative.cpp
    clang/lib/Sema/SemaChecking.cpp
    clang/lib/Sema/SemaDecl.cpp
    clang/lib/Sema/SemaDeclAttr.cpp
    clang/lib/Sema/SemaDeclCXX.cpp
    clang/lib/Sema/SemaExpr.cpp
    clang/lib/Sema/SemaOverload.cpp
    clang/lib/Sema/SemaType.cpp
    clang/test/AST/ast-dump-sme-attributes.cpp
    clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
    clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
    clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
    clang/test/Modules/aarch64-sme-keywords.cppm
    clang/test/Parser/c2x-attribute-keywords.c
    clang/test/Parser/c2x-attribute-keywords.m
    clang/test/Parser/cxx0x-keyword-attributes.cpp
    clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
    clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
    clang/test/Sema/aarch64-sme-func-attrs.c
    clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
    clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
    clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
    clang/utils/TableGen/ClangAttrEmitter.cpp
    clang/utils/TableGen/SveEmitter.cpp
    clang/utils/TableGen/TableGen.cpp
    clang/utils/TableGen/TableGenBackends.h

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index d4e5310fb3abc60..a7efe78591635e0 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -4036,12 +4036,27 @@ class FunctionType : public Type {
     SME_NormalFunction = 0,
     SME_PStateSMEnabledMask = 1 << 0,
     SME_PStateSMCompatibleMask = 1 << 1,
-    SME_PStateZASharedMask = 1 << 2,
-    SME_PStateZAPreservedMask = 1 << 3,
-    SME_AttributeMask = 0b111'111 // We only support maximum 6 bits because of the
-                                  // bitmask in FunctionTypeExtraBitfields.
+
+    // Describes the value of the state using ArmStateValue.
+    SME_ZAShift = 2,
+    SME_ZAMask = 0b111 << SME_ZAShift,
+
+    SME_AttributeMask = 0b111'111 // We only support maximum 6 bits because of
+                                  // the bitmask in FunctionTypeExtraBitfields.
+  };
+
+  enum ArmStateValue : unsigned {
+    ARM_None = 0,
+    ARM_Preserves = 1,
+    ARM_In = 2,
+    ARM_Out = 3,
+    ARM_InOut = 4,
   };
 
+  static ArmStateValue getArmZAState(unsigned AttrBits) {
+    return (ArmStateValue)((AttrBits & SME_ZAMask) >> SME_ZAShift);
+  }
+
   /// A simple holder for various uncommon bits which do not fit in
   /// FunctionTypeBitfields. Aligned to alignof(void *) to maintain the
   /// alignment of subsequent objects in TrailingObjects.

diff  --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index a03b0e44e15f7d9..b9ec720dd9e199c 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2539,16 +2539,45 @@ def ArmStreamingCompatible : TypeAttr, TargetSpecificAttr<TargetAArch64> {
   let Documentation = [ArmSmeStreamingCompatibleDocs];
 }
 
-def ArmSharedZA : TypeAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_shared_za">];
+def ArmNew : InheritableAttr, TargetSpecificAttr<TargetAArch64> {
+  let Spellings = [RegularKeyword<"__arm_new">];
+  let Args = [VariadicStringArgument<"NewArgs">];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [ArmNewDocs];
+
+  let AdditionalMembers = [{
+    bool isNewZA() const {
+      return llvm::is_contained(newArgs(), "za");
+    }
+  }];
+}
+
+def ArmIn : TypeAttr, TargetSpecificAttr<TargetAArch64> {
+  let Spellings = [RegularKeyword<"__arm_in">];
+  let Args = [VariadicStringArgument<"InArgs">];
+  let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
+  let Documentation = [ArmInDocs];
+}
+
+def ArmOut : TypeAttr, TargetSpecificAttr<TargetAArch64> {
+  let Spellings = [RegularKeyword<"__arm_out">];
+  let Args = [VariadicStringArgument<"OutArgs">];
+  let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
+  let Documentation = [ArmOutDocs];
+}
+
+def ArmInOut : TypeAttr, TargetSpecificAttr<TargetAArch64> {
+  let Spellings = [RegularKeyword<"__arm_inout">];
+  let Args = [VariadicStringArgument<"InOutArgs">];
   let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
-  let Documentation = [ArmSmeSharedZADocs];
+  let Documentation = [ArmInOutDocs];
 }
 
-def ArmPreservesZA : TypeAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_preserves_za">];
+def ArmPreserves : TypeAttr, TargetSpecificAttr<TargetAArch64> {
+  let Spellings = [RegularKeyword<"__arm_preserves">];
+  let Args = [VariadicStringArgument<"PreserveArgs">];
   let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>;
-  let Documentation = [ArmSmePreservesZADocs];
+  let Documentation = [ArmPreservesDocs];
 }
 
 def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr<TargetAArch64> {
@@ -2557,14 +2586,6 @@ def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr<TargetAArch64> {
   let Documentation = [ArmSmeLocallyStreamingDocs];
 }
 
-def ArmNewZA : InheritableAttr, TargetSpecificAttr<TargetAArch64> {
-  let Spellings = [RegularKeyword<"__arm_new_za">];
-  let Subjects = SubjectList<[Function], ErrorDiag>;
-  let Documentation = [ArmSmeNewZADocs];
-}
-def : MutualExclusions<[ArmNewZA, ArmSharedZA]>;
-def : MutualExclusions<[ArmNewZA, ArmPreservesZA]>;
-
 
 def Pure : InheritableAttr {
   let Spellings = [GCC<"pure">];

diff  --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2e8d7752c9751e9..9e8190614fbe8ab 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -6861,30 +6861,73 @@ without changing modes.
   }];
 }
 
-def ArmSmeSharedZADocs : Documentation {
+def ArmInDocs : Documentation {
   let Category = DocCatArmSmeAttributes;
   let Content = [{
-The ``__arm_shared_za`` keyword applies to prototyped function types and specifies
-that the function shares SME's matrix storage (ZA) with its caller.  This
-means that:
+The ``__arm_in`` keyword applies to prototyped function types and specifies
+that the function shares a given state S with its caller.  For ``__arm_in``, the
+function takes the state S as input and returns with the state S unchanged.
 
-* the function requires that the processor implements the Scalable Matrix
-  Extension (SME).
+The attribute takes string arguments to instruct the compiler which state
+is shared.  The supported states for S are:
 
-* the function enters with ZA in an active state.
+* ``"za"`` for Matrix Storage (requires SME)
 
-* the function returns with ZA in an active state.
+The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and
+``__arm_preserves(S)`` are all mutually exclusive for the same state S.
   }];
 }
 
-def ArmSmePreservesZADocs : Documentation {
+def ArmOutDocs : Documentation {
   let Category = DocCatArmSmeAttributes;
   let Content = [{
-The ``__arm_preserves_za`` keyword applies to prototyped function types and
-specifies that the function does not modify ZA state.
+The ``__arm_out`` keyword applies to prototyped function types and specifies
+that the function shares a given state S with its caller.  For ``__arm_out``,
+the function ignores the incoming state for S and returns new state for S.
+
+The attribute takes string arguments to instruct the compiler which state
+is shared.  The supported states for S are:
+
+* ``"za"`` for Matrix Storage (requires SME)
+
+The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and
+``__arm_preserves(S)`` are all mutually exclusive for the same state S.
   }];
 }
 
+def ArmInOutDocs : Documentation {
+  let Category = DocCatArmSmeAttributes;
+  let Content = [{
+The ``__arm_inout`` keyword applies to prototyped function types and specifies
+that the function shares a given state S with its caller.  For ``__arm_inout``,
+the function takes the state S as input and returns new state for S.
+
+The attribute takes string arguments to instruct the compiler which state
+is shared.  The supported states for S are:
+
+* ``"za"`` for Matrix Storage (requires SME)
+
+The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and
+``__arm_preserves(S)`` are all mutually exclusive for the same state S.
+  }];
+}
+
+def ArmPreservesDocs : Documentation {
+  let Category = DocCatArmSmeAttributes;
+  let Content = [{
+The ``__arm_preserves`` keyword applies to prototyped function types and
+specifies that the function does not read a given state S and returns
+with state S unchanged.
+
+The attribute takes string arguments to instruct the compiler which state
+is shared.  The supported states for S are:
+
+* ``"za"`` for Matrix Storage (requires SME)
+
+The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and
+``__arm_preserves(S)`` are all mutually exclusive for the same state S.
+  }];
+}
 
 def ArmSmeLocallyStreamingDocs : Documentation {
   let Category = DocCatArmSmeAttributes;
@@ -6907,13 +6950,18 @@ at the end of the function.
   }];
 }
 
-def ArmSmeNewZADocs : Documentation {
+def ArmNewDocs : Documentation {
   let Category = DocCatArmSmeAttributes;
   let Content = [{
-The ``__arm_new_za`` keyword applies to function declarations and specifies
-that the function will be set up with a fresh ZA context.
+The ``__arm_new`` keyword applies to function declarations and specifies
+that the function will create a new scope for state S.
+
+The attribute takes string arguments to instruct the compiler for which state
+to create new scope.  The supported states for S are:
+
+* ``"za"`` for Matrix Storage (requires SME)
 
-This means that:
+For state ``"za"``, this means that:
 
 * the function requires that the target processor implements the Scalable Matrix
   Extension (SME).
@@ -6924,8 +6972,8 @@ This means that:
 
 * the function will disable PSTATE.ZA (by setting it to 0) before returning.
 
-For ``__arm_new_za`` functions Clang will set up the ZA context automatically
-on entry to the function, and disable it before returning. For example, if ZA is
+For ``__arm_new("za")`` functions Clang will set up the ZA context automatically
+on entry to the function and disable it before returning. For example, if ZA is
 in a dormant state Clang will generate the code to commit a lazy-save and set up
 a new ZA state before executing user code.
   }];

diff  --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h
index 018b92fdc11f559..d787e4959bfee35 100644
--- a/clang/include/clang/Basic/AttributeCommonInfo.h
+++ b/clang/include/clang/Basic/AttributeCommonInfo.h
@@ -255,6 +255,19 @@ class AttributeCommonInfo {
     return SpellingIndex != SpellingNotCalculated;
   }
 };
+
+inline bool doesKeywordAttributeTakeArgs(tok::TokenKind Kind) {
+  switch (Kind) {
+  default:
+    return false;
+#define KEYWORD_ATTRIBUTE(NAME, HASARG)                                        \
+  case tok::kw_##NAME:                                                         \
+    return HASARG;
+#include "clang/Basic/RegularKeywordAttrInfo.inc"
+#undef KEYWORD_ATTRIBUTE
+  }
+}
+
 } // namespace clang
 
 #endif // LLVM_CLANG_BASIC_ATTRIBUTECOMMONINFO_H

diff  --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
index 28baa2e45e423e9..be216150c71bbb8 100644
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -45,10 +45,10 @@ clang_tablegen(AttrSubMatchRulesList.inc -gen-clang-attr-subject-match-rule-list
   SOURCE Attr.td
   TARGET ClangAttrSubjectMatchRuleList)
 
-clang_tablegen(AttrTokenKinds.inc -gen-clang-attr-token-kinds
+clang_tablegen(RegularKeywordAttrInfo.inc -gen-clang-regular-keyword-attr-info
   -I ${CMAKE_CURRENT_SOURCE_DIR}/../../
   SOURCE Attr.td
-  TARGET ClangAttrTokenKinds
+  TARGET ClangRegularKeywordAttrInfo
   )
 
 clang_tablegen(AttrHasAttributeImpl.inc -gen-clang-attr-has-attribute-impl

diff  --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 1a79892e40030ae..c50b188a1039acb 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3700,6 +3700,12 @@ def err_sme_definition_using_sm_in_non_sme_target : Error<
   "function executed in streaming-SVE mode requires 'sme'">;
 def err_sme_definition_using_za_in_non_sme_target : Error<
   "function using ZA state requires 'sme'">;
+def err_conflicting_attributes_arm_state : Error<
+  "conflicting attributes for state '%0'">;
+def err_unknown_arm_state : Error<
+  "unknown state '%0'">;
+def err_missing_arm_state : Error<
+  "missing state for %0">;
 def err_cconv_change : Error<
   "function declared '%0' here was previously declared "
   "%select{'%2'|without calling convention}1">;

diff  --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 3f0e1e1a7d45ad2..d15e4970b7d8f17 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -761,9 +761,9 @@ KEYWORD(__builtin_sycl_unique_stable_name, KEYSYCL)
 
 // Keywords defined by Attr.td.
 #ifndef KEYWORD_ATTRIBUTE
-#define KEYWORD_ATTRIBUTE(X) KEYWORD(X, KEYALL)
+#define KEYWORD_ATTRIBUTE(X, ...) KEYWORD(X, KEYALL)
 #endif
-#include "clang/Basic/AttrTokenKinds.inc"
+#include "clang/Basic/RegularKeywordAttrInfo.inc"
 
 // Clang-specific keywords enabled only in testing.
 TESTING_KEYWORD(__unknown_anytype , KEYALL)

diff  --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index e4857405bc7f462..7529b922619ada3 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -109,8 +109,8 @@ bool isPragmaAnnotation(TokenKind K);
 
 inline constexpr bool isRegularKeywordAttribute(TokenKind K) {
   return (false
-#define KEYWORD_ATTRIBUTE(X) || (K == tok::kw_##X)
-#include "clang/Basic/AttrTokenKinds.inc"
+#define KEYWORD_ATTRIBUTE(X, ...) || (K == tok::kw_##X)
+#include "clang/Basic/RegularKeywordAttrInfo.inc"
   );
 }
 

diff  --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index cf2d4fbe6d3ba10..4ef1fe542ea54f0 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -7110,15 +7110,7 @@ class Sema final {
                                  NestedNameSpecInfo &IdInfo,
                                  bool EnteringContext);
 
-  /// The kind of conversion to check for. Either all attributes must match exactly,
-  /// or the converted type may add/drop '__arm_preserves_za'.
-  enum class AArch64SMECallConversionKind {
-    MatchExactly,
-    MayAddPreservesZA,
-    MayDropPreservesZA,
-  };
-  bool IsInvalidSMECallConversion(QualType FromType, QualType ToType,
-                                  AArch64SMECallConversionKind C);
+  bool IsInvalidSMECallConversion(QualType FromType, QualType ToType);
 
   /// The parser has parsed a nested-name-specifier
   /// 'template[opt] template-name < template-args >::'.

diff  --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index f69412429273674..1baf895ebaec2cd 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -937,15 +937,20 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T,
   OS << ')';
 
   FunctionType::ExtInfo Info = T->getExtInfo();
+  unsigned SMEBits = T->getAArch64SMEAttributes();
 
-  if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask))
+  if (SMEBits & FunctionType::SME_PStateSMCompatibleMask)
     OS << " __arm_streaming_compatible";
-  if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask))
+  if (SMEBits & FunctionType::SME_PStateSMEnabledMask)
     OS << " __arm_streaming";
-  if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask))
-    OS << " __arm_shared_za";
-  if ((T->getAArch64SMEAttributes() & FunctionType::SME_PStateZAPreservedMask))
-    OS << " __arm_preserves_za";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves)
+    OS << " __arm_preserves(\"za\")";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In)
+    OS << " __arm_in(\"za\")";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out)
+    OS << " __arm_out(\"za\")";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut)
+    OS << " __arm_inout(\"za\")";
 
   printFunctionAfter(Info, OS);
 
@@ -1788,14 +1793,6 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
     OS << "__arm_streaming_compatible";
     return;
   }
-  if (T->getAttrKind() == attr::ArmSharedZA) {
-    OS << "__arm_shared_za";
-    return;
-  }
-  if (T->getAttrKind() == attr::ArmPreservesZA) {
-    OS << "__arm_preserves_za";
-    return;
-  }
 
   OS << " __attribute__((";
   switch (T->getAttrKind()) {
@@ -1839,8 +1836,10 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
   case attr::WebAssemblyFuncref:
   case attr::ArmStreaming:
   case attr::ArmStreamingCompatible:
-  case attr::ArmSharedZA:
-  case attr::ArmPreservesZA:
+  case attr::ArmIn:
+  case attr::ArmOut:
+  case attr::ArmInOut:
+  case attr::ArmPreserves:
     llvm_unreachable("This attribute should have been handled already");
 
   case attr::NSReturnsRetained:

diff  --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 13677cf150aed2f..acf6cbad1c74809 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1767,14 +1767,21 @@ static void AddAttributesFromFunctionProtoType(ASTContext &Ctx,
       FPT->isNothrow())
     FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
 
-  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask)
+  unsigned SMEBits = FPT->getAArch64SMEAttributes();
+  if (SMEBits & FunctionType::SME_PStateSMEnabledMask)
     FuncAttrs.addAttribute("aarch64_pstate_sm_enabled");
-  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask)
+  if (SMEBits & FunctionType::SME_PStateSMCompatibleMask)
     FuncAttrs.addAttribute("aarch64_pstate_sm_compatible");
-  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask)
+
+  // ZA
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out ||
+      FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut)
+    FuncAttrs.addAttribute("aarch64_pstate_za_shared");
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves ||
+      FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In) {
     FuncAttrs.addAttribute("aarch64_pstate_za_shared");
-  if (FPT->getAArch64SMEAttributes() & FunctionType::SME_PStateZAPreservedMask)
     FuncAttrs.addAttribute("aarch64_pstate_za_preserved");
+  }
 }
 
 static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs,
@@ -2446,9 +2453,6 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
 
     if (TargetDecl->hasAttr<ArmLocallyStreamingAttr>())
       FuncAttrs.addAttribute("aarch64_pstate_sm_body");
-
-    if (TargetDecl->hasAttr<ArmNewZAAttr>())
-      FuncAttrs.addAttribute("aarch64_pstate_za_new");
   }
 
   // Attach "no-builtins" attributes to:

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 0cfe7a0133b7e32..01b042ce5dd1343 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2380,8 +2380,10 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
   if (D->hasAttr<ArmLocallyStreamingAttr>())
     B.addAttribute("aarch64_pstate_sm_body");
 
-  if (D->hasAttr<ArmNewZAAttr>())
-    B.addAttribute("aarch64_pstate_za_new");
+  if (auto *Attr = D->getAttr<ArmNewAttr>()) {
+    if (Attr->isNewZA())
+      B.addAttribute("aarch64_pstate_za_new");
+  }
 
   // Track whether we need to add the optnone LLVM attribute,
   // starting with the default for this optimization level.

diff  --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index ed684c5d57b1ee0..8d856cc2cf8313d 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -6791,7 +6791,13 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
     } else if (Tok.isRegularKeywordAttribute()) {
       // For consistency with attribute parsing.
       Diag(Tok, diag::err_keyword_not_allowed) << Tok.getIdentifierInfo();
+      bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind());
       ConsumeToken();
+      if (TakesArgs) {
+        BalancedDelimiterTracker T(*this, tok::l_paren);
+        if (!T.consumeOpen())
+          T.skipToEnd();
+      }
     } else if (Tok.is(tok::kw_requires) && D.hasGroupingParens()) {
       // This declarator is declaring a function, but the requires clause is
       // in the wrong place:

diff  --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index d97081da4200de2..5576be9e717a9b5 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1890,7 +1890,13 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
         if (!SkipUntil(tok::r_paren, StopAtSemi))
           break;
       } else if (Tok.isRegularKeywordAttribute()) {
+        bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind());
         ConsumeToken();
+        if (TakesArgs) {
+          BalancedDelimiterTracker T(*this, tok::l_paren);
+          if (!T.consumeOpen())
+            T.skipToEnd();
+        }
       } else {
         break;
       }
@@ -4539,8 +4545,18 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs,
   if (Tok.isRegularKeywordAttribute()) {
     SourceLocation Loc = Tok.getLocation();
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
-    Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Tok.getKind());
+    ParsedAttr::Form Form = ParsedAttr::Form(Tok.getKind());
+    bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind());
     ConsumeToken();
+    if (TakesArgs) {
+      if (!Tok.is(tok::l_paren))
+        Diag(Tok.getLocation(), diag::err_expected_lparen_after) << AttrName;
+      else
+        ParseAttributeArgsCommon(AttrName, Loc, Attrs, EndLoc,
+                                 /*ScopeName*/ nullptr,
+                                 /*ScopeLoc*/ Loc, Form);
+    } else
+      Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Form);
     return;
   }
 
@@ -4706,11 +4722,13 @@ SourceLocation Parser::SkipCXX11Attributes() {
       T.consumeOpen();
       T.skipToEnd();
       EndLoc = T.getCloseLocation();
-    } else if (Tok.isRegularKeywordAttribute()) {
+    } else if (Tok.isRegularKeywordAttribute() &&
+               !doesKeywordAttributeTakeArgs(Tok.getKind())) {
       EndLoc = Tok.getLocation();
       ConsumeToken();
     } else {
-      assert(Tok.is(tok::kw_alignas) && "not an attribute specifier");
+      assert((Tok.is(tok::kw_alignas) || Tok.isRegularKeywordAttribute()) &&
+             "not an attribute specifier");
       ConsumeToken();
       BalancedDelimiterTracker T(*this, tok::l_paren);
       if (!T.consumeOpen())

diff  --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp
index 242741c15b5ffac..5bfabf55f50c445 100644
--- a/clang/lib/Parse/ParseTentative.cpp
+++ b/clang/lib/Parse/ParseTentative.cpp
@@ -894,7 +894,8 @@ bool Parser::TrySkipAttributes() {
       // Note that explicitly checking for `[[` and `]]` allows to fail as
       // expected in the case of the Objective-C message send syntax.
       ConsumeBracket();
-    } else if (Tok.isRegularKeywordAttribute()) {
+    } else if (Tok.isRegularKeywordAttribute() &&
+               !doesKeywordAttributeTakeArgs(Tok.getKind())) {
       ConsumeToken();
     } else {
       ConsumeToken();

diff  --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 74f8f626fb1637c..ace3e386988f005 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3190,11 +3190,15 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
 }
 
 static bool hasSMEZAState(const FunctionDecl *FD) {
-  if (FD->hasAttr<ArmNewZAAttr>())
-    return true;
-  if (const auto *T = FD->getType()->getAs<FunctionProtoType>())
-    if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateZASharedMask)
+  if (auto *Attr = FD->getAttr<ArmNewAttr>())
+    if (Attr->isNewZA())
+      return true;
+  if (const auto *T = FD->getType()->getAs<FunctionProtoType>()) {
+    FunctionType::ArmStateValue State =
+        FunctionType::getArmZAState(T->getAArch64SMEAttributes());
+    if (State != FunctionType::ARM_None)
       return true;
+  }
   return false;
 }
 
@@ -7522,14 +7526,19 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
 
     // If the callee uses AArch64 SME ZA state but the caller doesn't define
     // any, then this is an error.
-    if (ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateZASharedMask) {
+    FunctionType::ArmStateValue ArmZAState =
+        FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes);
+    if (ArmZAState != FunctionType::ARM_None) {
       bool CallerHasZAState = false;
       if (const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) {
-        if (CallerFD->hasAttr<ArmNewZAAttr>())
+        auto *Attr = CallerFD->getAttr<ArmNewAttr>();
+        if (Attr && Attr->isNewZA())
           CallerHasZAState = true;
-        else if (const auto *FPT = CallerFD->getType()->getAs<FunctionProtoType>())
-          CallerHasZAState = FPT->getExtProtoInfo().AArch64SMEAttributes &
-                             FunctionType::SME_PStateZASharedMask;
+        else if (const auto *FPT =
+                     CallerFD->getType()->getAs<FunctionProtoType>())
+          CallerHasZAState = FunctionType::getArmZAState(
+                                 FPT->getExtProtoInfo().AArch64SMEAttributes) !=
+                             FunctionType::ARM_None;
       }
 
       if (!CallerHasZAState)

diff  --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index e92fd104d78eb57..4e7049571eeb7a3 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3819,8 +3819,7 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S,
 
   // It is not permitted to redeclare an SME function with different SME
   // attributes.
-  if (IsInvalidSMECallConversion(Old->getType(), New->getType(),
-                                 AArch64SMECallConversionKind::MatchExactly)) {
+  if (IsInvalidSMECallConversion(Old->getType(), New->getType())) {
     Diag(New->getLocation(), diag::err_sme_attr_mismatch)
         << New->getType() << Old->getType();
     Diag(OldLocation, diag::note_previous_declaration);
@@ -12180,13 +12179,15 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
   // Check if the function definition uses any AArch64 SME features without
   // having the '+sme' feature enabled.
   if (DeclIsDefn) {
+    const auto *Attr = NewFD->getAttr<ArmNewAttr>();
     bool UsesSM = NewFD->hasAttr<ArmLocallyStreamingAttr>();
-    bool UsesZA = NewFD->hasAttr<ArmNewZAAttr>();
+    bool UsesZA = Attr && Attr->isNewZA();
     if (const auto *FPT = NewFD->getType()->getAs<FunctionProtoType>()) {
       FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
       UsesSM |=
           EPI.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask;
-      UsesZA |= EPI.AArch64SMEAttributes & FunctionType::SME_PStateZASharedMask;
+      UsesZA |= FunctionType::getArmZAState(EPI.AArch64SMEAttributes) !=
+                FunctionType::ARM_None;
     }
 
     if (UsesSM || UsesZA) {

diff  --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 1a58cfd8e4179e7..7e6881049d8d953 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8951,26 +8951,74 @@ static bool MustDelayAttributeArguments(const ParsedAttr &AL) {
   return false;
 }
 
+static bool checkArmNewAttrMutualExclusion(
+    Sema &S, const ParsedAttr &AL, const FunctionProtoType *FPT,
+    FunctionType::ArmStateValue CurrentState, StringRef StateName) {
+  auto CheckForIncompatibleAttr =
+      [&](FunctionType::ArmStateValue IncompatibleState,
+          StringRef IncompatibleStateName) {
+        if (CurrentState == IncompatibleState) {
+          S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible)
+              << (std::string("'__arm_new(\"") + StateName.str() + "\")'")
+              << (std::string("'") + IncompatibleStateName.str() + "(\"" +
+                  StateName.str() + "\")'")
+              << true;
+          AL.setInvalid();
+        }
+      };
 
-static void handleArmNewZaAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (auto *FPT = dyn_cast<FunctionProtoType>(D->getFunctionType())) {
-    if (FPT->getAArch64SMEAttributes() &
-        FunctionType::SME_PStateZASharedMask) {
-      S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible)
-          << AL << "'__arm_shared_za'" << true;
+  CheckForIncompatibleAttr(FunctionType::ARM_In, "__arm_in");
+  CheckForIncompatibleAttr(FunctionType::ARM_Out, "__arm_out");
+  CheckForIncompatibleAttr(FunctionType::ARM_InOut, "__arm_inout");
+  CheckForIncompatibleAttr(FunctionType::ARM_Preserves, "__arm_preserves");
+  return AL.isInvalid();
+}
+
+static void handleArmNewAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (!AL.getNumArgs()) {
+    S.Diag(AL.getLoc(), diag::err_missing_arm_state) << AL;
+    AL.setInvalid();
+    return;
+  }
+
+  std::vector<StringRef> NewState;
+  if (const auto *ExistingAttr = D->getAttr<ArmNewAttr>()) {
+    for (StringRef S : ExistingAttr->newArgs())
+      NewState.push_back(S);
+  }
+
+  bool HasZA = false;
+  for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) {
+    StringRef StateName;
+    SourceLocation LiteralLoc;
+    if (!S.checkStringLiteralArgumentAttr(AL, I, StateName, &LiteralLoc))
+      return;
+
+    if (StateName == "za")
+      HasZA = true;
+    else {
+      S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName;
       AL.setInvalid();
+      return;
     }
-    if (FPT->getAArch64SMEAttributes() &
-        FunctionType::SME_PStateZAPreservedMask) {
-      S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible)
-          << AL << "'__arm_preserves_za'" << true;
-      AL.setInvalid();
+
+    if (std::find(NewState.begin(), NewState.end(), StateName) ==
+        NewState.end()) { // Avoid adding duplicates.
+      NewState.push_back(StateName);
     }
-    if (AL.isInvalid())
+  }
+
+  if (auto *FPT = dyn_cast<FunctionProtoType>(D->getFunctionType())) {
+    FunctionType::ArmStateValue ZAState =
+        FunctionType::getArmZAState(FPT->getAArch64SMEAttributes());
+    if (HasZA && ZAState != FunctionType::ARM_None &&
+        checkArmNewAttrMutualExclusion(S, AL, FPT, ZAState, "za"))
       return;
   }
 
-  handleSimpleAttribute<ArmNewZAAttr>(S, D, AL);
+  D->dropAttr<ArmNewAttr>();
+  D->addAttr(::new (S.Context)
+                 ArmNewAttr(S.Context, AL, NewState.data(), NewState.size()));
 }
 
 /// ProcessDeclAttribute - Apply the specific attribute to the specified decl if
@@ -9752,8 +9800,8 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
     handleSimpleAttribute<ArmLocallyStreamingAttr>(S, D, AL);
     break;
 
-  case ParsedAttr::AT_ArmNewZA:
-    handleArmNewZaAttr(S, D, AL);
+  case ParsedAttr::AT_ArmNew:
+    handleArmNewAttr(S, D, AL);
     break;
 
   case ParsedAttr::AT_AcquireHandle:

diff  --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 36e53c684ac4dc3..f229e734d06b283 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -18328,9 +18328,7 @@ bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New,
   }
 
   // SME attributes must match when overriding a function declaration.
-  if (IsInvalidSMECallConversion(
-          Old->getType(), New->getType(),
-          AArch64SMECallConversionKind::MayAddPreservesZA)) {
+  if (IsInvalidSMECallConversion(Old->getType(), New->getType())) {
     Diag(New->getLocation(), diag::err_conflicting_overriding_attributes)
         << New << New->getType() << Old->getType();
     Diag(Old->getLocation(), diag::note_overridden_virtual_function);

diff  --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 2f48ea237cdfa42..049fdae09bb185c 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -9803,8 +9803,7 @@ ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc,
 }
 
 // Check that the SME attributes for PSTATE.ZA and PSTATE.SM are compatible.
-bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType,
-                                      AArch64SMECallConversionKind C) {
+bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType) {
   unsigned FromAttributes = 0, ToAttributes = 0;
   if (const auto *FromFn =
           dyn_cast<FunctionProtoType>(Context.getCanonicalType(FromType)))
@@ -9815,25 +9814,7 @@ bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType,
     ToAttributes =
         ToFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask;
 
-  if (FromAttributes == ToAttributes)
-    return false;
-
-  // If the '__arm_preserves_za' is the only difference between the types,
-  // check whether we're allowed to add or remove it.
-  if ((FromAttributes ^ ToAttributes) ==
-      FunctionType::SME_PStateZAPreservedMask) {
-    switch (C) {
-    case AArch64SMECallConversionKind::MatchExactly:
-      return true;
-    case AArch64SMECallConversionKind::MayAddPreservesZA:
-      return !(ToAttributes & FunctionType::SME_PStateZAPreservedMask);
-    case AArch64SMECallConversionKind::MayDropPreservesZA:
-      return !(FromAttributes & FunctionType::SME_PStateZAPreservedMask);
-    }
-  }
-
-  // There has been a mismatch of attributes
-  return true;
+  return FromAttributes != ToAttributes;
 }
 
 // Check if we have a conversion between incompatible cmse function pointer
@@ -10002,9 +9983,7 @@ checkPointerTypesForAssignment(Sema &S, QualType LHSType, QualType RHSType,
     return Sema::IncompatibleFunctionPointer;
   if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans))
     return Sema::IncompatibleFunctionPointer;
-  if (S.IsInvalidSMECallConversion(
-          rtrans, ltrans,
-          Sema::AArch64SMECallConversionKind::MayDropPreservesZA))
+  if (S.IsInvalidSMECallConversion(rtrans, ltrans))
     return Sema::IncompatibleFunctionPointer;
   return ConvTy;
 }

diff  --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 23b9bc0fe2d6e2e..31111222e043787 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -1784,26 +1784,6 @@ bool Sema::IsFunctionConversion(QualType FromType, QualType ToType,
     Changed = true;
   }
 
-  // Drop the 'arm_preserves_za' if not present in the target type (we can do
-  // that because it is merely a hint).
-  if (const auto *FromFPT = dyn_cast<FunctionProtoType>(FromFn)) {
-    FunctionProtoType::ExtProtoInfo ExtInfo = FromFPT->getExtProtoInfo();
-    if (ExtInfo.AArch64SMEAttributes &
-        FunctionType::SME_PStateZAPreservedMask) {
-      unsigned ToFlags = 0;
-      if (const auto *ToFPT = dyn_cast<FunctionProtoType>(ToFn))
-        ToFlags = ToFPT->getExtProtoInfo().AArch64SMEAttributes;
-      if (!(ToFlags & FunctionType::SME_PStateZAPreservedMask)) {
-        ExtInfo.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask,
-                                   false);
-        QualType QT = Context.getFunctionType(
-            FromFPT->getReturnType(), FromFPT->getParamTypes(), ExtInfo);
-        FromFn = QT->getAs<FunctionType>();
-        Changed = true;
-      }
-    }
-  }
-
   // Drop 'noexcept' if not present in target type.
   if (const auto *FromFPT = dyn_cast<FunctionProtoType>(FromFn)) {
     const auto *ToFPT = cast<FunctionProtoType>(ToFn);

diff  --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index a376f20fa4f4e08..78702b41ab82005 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -147,8 +147,10 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr,
   case ParsedAttr::AT_CmseNSCall:                                              \
   case ParsedAttr::AT_ArmStreaming:                                            \
   case ParsedAttr::AT_ArmStreamingCompatible:                                  \
-  case ParsedAttr::AT_ArmSharedZA:                                             \
-  case ParsedAttr::AT_ArmPreservesZA:                                          \
+  case ParsedAttr::AT_ArmPreserves:                                            \
+  case ParsedAttr::AT_ArmIn:                                                   \
+  case ParsedAttr::AT_ArmOut:                                                  \
+  case ParsedAttr::AT_ArmInOut:                                                \
   case ParsedAttr::AT_AnyX86NoCallerSavedRegisters:                            \
   case ParsedAttr::AT_AnyX86NoCfCheck:                                         \
     CALLING_CONV_ATTRS_CASELIST
@@ -7876,6 +7878,49 @@ static bool checkMutualExclusion(TypeProcessingState &state,
   return true;
 }
 
+static bool handleArmStateAttribute(Sema &S,
+                                    FunctionProtoType::ExtProtoInfo &EPI,
+                                    ParsedAttr &Attr,
+                                    FunctionType::ArmStateValue State) {
+  if (!Attr.getNumArgs()) {
+    S.Diag(Attr.getLoc(), diag::err_missing_arm_state) << Attr;
+    Attr.setInvalid();
+    return true;
+  }
+
+  for (unsigned I = 0; I < Attr.getNumArgs(); ++I) {
+    StringRef StateName;
+    SourceLocation LiteralLoc;
+    if (!S.checkStringLiteralArgumentAttr(Attr, I, StateName, &LiteralLoc))
+      return true;
+
+    unsigned Shift;
+    FunctionType::ArmStateValue ExistingState;
+    if (StateName == "za") {
+      Shift = FunctionType::SME_ZAShift;
+      ExistingState = FunctionType::getArmZAState(EPI.AArch64SMEAttributes);
+    } else {
+      S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName;
+      Attr.setInvalid();
+      return true;
+    }
+
+    // __arm_in(S), __arm_out(S), __arm_inout(S) and __arm_preserves(S)
+    // are all mutually exclusive for the same S, so check if there are
+    // conflicting attributes.
+    if (ExistingState != FunctionType::ARM_None && ExistingState != State) {
+      S.Diag(LiteralLoc, diag::err_conflicting_attributes_arm_state)
+          << StateName;
+      Attr.setInvalid();
+      return true;
+    }
+
+    EPI.setArmSMEAttribute(
+        (FunctionType::AArch64SMETypeAttributes)((State << Shift)));
+  }
+  return false;
+}
+
 /// Process an individual function attribute.  Returns true to
 /// indicate that the attribute was handled, false if it wasn't.
 static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
@@ -8008,11 +8053,18 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
 
   if (attr.getKind() == ParsedAttr::AT_ArmStreaming ||
       attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible ||
-      attr.getKind() == ParsedAttr::AT_ArmSharedZA ||
-      attr.getKind() == ParsedAttr::AT_ArmPreservesZA){
-    if (S.CheckAttrTarget(attr) || S.CheckAttrNoArgs(attr))
+      attr.getKind() == ParsedAttr::AT_ArmPreserves ||
+      attr.getKind() == ParsedAttr::AT_ArmIn ||
+      attr.getKind() == ParsedAttr::AT_ArmOut ||
+      attr.getKind() == ParsedAttr::AT_ArmInOut) {
+    if (S.CheckAttrTarget(attr))
       return true;
 
+    if (attr.getKind() == ParsedAttr::AT_ArmStreaming ||
+        attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible)
+      if (S.CheckAttrNoArgs(attr))
+        return true;
+
     if (!unwrapped.isFunctionType())
       return false;
 
@@ -8039,11 +8091,21 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
         return true;
       EPI.setArmSMEAttribute(FunctionType::SME_PStateSMCompatibleMask);
       break;
-    case ParsedAttr::AT_ArmSharedZA:
-      EPI.setArmSMEAttribute(FunctionType::SME_PStateZASharedMask);
+    case ParsedAttr::AT_ArmPreserves:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_Preserves))
+        return true;
+      break;
+    case ParsedAttr::AT_ArmIn:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_In))
+        return true;
       break;
-    case ParsedAttr::AT_ArmPreservesZA:
-      EPI.setArmSMEAttribute(FunctionType::SME_PStateZAPreservedMask);
+    case ParsedAttr::AT_ArmOut:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_Out))
+        return true;
+      break;
+    case ParsedAttr::AT_ArmInOut:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_InOut))
+        return true;
       break;
     default:
       llvm_unreachable("Unsupported attribute");

diff  --git a/clang/test/AST/ast-dump-sme-attributes.cpp b/clang/test/AST/ast-dump-sme-attributes.cpp
index 6581fd4baba9e4f..133648d90a15764 100644
--- a/clang/test/AST/ast-dump-sme-attributes.cpp
+++ b/clang/test/AST/ast-dump-sme-attributes.cpp
@@ -13,16 +13,16 @@ struct Foo {
 // CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming_compatible 'void () __arm_streaming_compatible'
 // CHECK-NEXT: |-CXXMethodDecl {{.*}} f_locally_streaming 'void ()'
 // CHECK-NEXT: | `-ArmLocallyStreamingAttr
-// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_shared_za 'void () __arm_shared_za'
+// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_shared_za 'void () __arm_inout("za")'
 // CHECK-NEXT: |-CXXMethodDecl {{.*}} f_new_za 'void ()'
-// CHECK-NEXT: | `-ArmNewZAAttr
-// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_preserves_za 'void () __arm_preserves_za'
+// CHECK-NEXT: | `-ArmNewAttr {{.*}} za
+// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_preserves_za 'void () __arm_preserves("za")'
   void f_streaming() __arm_streaming;
   void f_streaming_compatible() __arm_streaming_compatible;
   __arm_locally_streaming void f_locally_streaming();
-  void f_shared_za() __arm_shared_za;
-  __arm_new_za void f_new_za();
-  void f_preserves_za() __arm_preserves_za;
+  void f_shared_za() __arm_inout("za");
+  __arm_new("za") void f_new_za();
+  void f_preserves_za() __arm_preserves("za");
 
 
 // CHECK:      |-CXXMethodDecl {{.*}} test_lambda 'int (int)' implicit-inline

diff  --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
index 0768bfc3323870c..f69703a8a7d89be 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
@@ -12,8 +12,8 @@ extern int normal_callee();
 
 int streaming_decl(void) __arm_streaming;
 int streaming_compatible_decl(void) __arm_streaming_compatible;
-int shared_za_decl(void) __arm_shared_za;
-int preserves_za_decl(void) __arm_preserves_za;
+int shared_za_decl(void) __arm_inout("za");
+int preserves_za_decl(void) __arm_preserves("za");
 int private_za_decl(void);
 
 // == FUNCTION DEFINITIONS ==
@@ -78,7 +78,7 @@ __arm_locally_streaming int locally_streaming_callee() {
 // CHECK-SAME: #[[ZA_SHARED:[0-9]+]]
 // CHECK: call i32 @normal_callee()
 //
- int shared_za_caller() __arm_shared_za {
+ int shared_za_caller() __arm_inout("za") {
   return normal_callee();
 }
 
@@ -86,7 +86,7 @@ __arm_locally_streaming int locally_streaming_callee() {
 // CHECK-SAME: #[[ZA_SHARED]]
 // CHECK: call i32 @shared_za_decl() #[[ZA_SHARED_CALL:[0-9]+]]
 //
- int shared_za_callee() __arm_shared_za {
+ int shared_za_callee() __arm_inout("za") {
   return shared_za_decl();
 }
 
@@ -97,7 +97,7 @@ __arm_locally_streaming int locally_streaming_callee() {
 // CHECK-SAME: #[[ZA_PRESERVED:[0-9]+]]
 // CHECK: call i32 @normal_callee()
 //
- int preserves_za_caller() __arm_preserves_za {
+ int preserves_za_caller() __arm_preserves("za") {
   return normal_callee();
 }
 
@@ -105,7 +105,7 @@ __arm_locally_streaming int locally_streaming_callee() {
 // CHECK-SAME: #[[ZA_PRESERVED]]
 // CHECK: call i32 @preserves_za_decl() #[[ZA_PRESERVED_CALL:[0-9]+]]
 //
- int preserves_za_callee() __arm_preserves_za {
+ int preserves_za_callee() __arm_preserves("za") {
   return preserves_za_decl();
 }
 
@@ -116,7 +116,7 @@ __arm_locally_streaming int locally_streaming_callee() {
 // CHECK-SAME: #[[ZA_NEW:[0-9]+]]
 // CHECK: call i32 @normal_callee()
 //
-__arm_new_za int new_za_caller() {
+__arm_new("za") int new_za_caller() {
   return normal_callee();
 }
 
@@ -124,7 +124,7 @@ __arm_new_za int new_za_caller() {
 // CHECK-SAME: #[[ZA_NEW]]
 // CHECK: call i32 @private_za_decl()
 //
-__arm_new_za int new_za_callee() {
+__arm_new("za") int new_za_callee() {
   return private_za_decl();
 }
 
@@ -135,8 +135,8 @@ __arm_new_za int new_za_callee() {
 // and also to callsites.
 typedef void (*s_ptrty) (int, int) __arm_streaming;
 typedef void (*sc_ptrty) (int, int) __arm_streaming_compatible;
-typedef void (*sz_ptrty) (int, int) __arm_shared_za;
-typedef void (*pz_ptrty) (int, int) __arm_preserves_za;
+typedef void (*sz_ptrty) (int, int) __arm_inout("za");
+typedef void (*pz_ptrty) (int, int) __arm_preserves("za");
 
 // CHECK-LABEL: @test_streaming_ptrty(
 // CHECK-SAME: #[[NORMAL_DEF:[0-9]+]]
@@ -152,12 +152,12 @@ void test_streaming_compatible_ptrty(sc_ptrty f, int x, int y) { return f(x, y);
 // CHECK-SAME: #[[ZA_SHARED]]
 // CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_SHARED_CALL]]
 //
-void  test_shared_za(sz_ptrty f, int x, int y) __arm_shared_za { return f(x, y); }
+void  test_shared_za(sz_ptrty f, int x, int y) __arm_inout("za") { return f(x, y); }
 // CHECK-LABEL: @test_preserved_za(
 // CHECK-SAME: #[[ZA_SHARED]]
 // CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_PRESERVED_CALL]]
 //
-void  test_preserved_za(pz_ptrty f, int x, int y) __arm_shared_za { return f(x, y); }
+void  test_preserved_za(pz_ptrty f, int x, int y) __arm_inout("za") { return f(x, y); }
 
 // CHECK-LABEL: @test_indirect_streaming_ptrty(
 // CHECK-SAME: #[[NORMAL_DEF:[0-9]+]]
@@ -255,7 +255,7 @@ int call() { return 0; }
 
 template <typename T, typename... Other>
 __attribute__((always_inline))
-int call(T f, Other... other) __arm_shared_za {
+int call(T f, Other... other) __arm_inout("za") {
     return f() + call(other...);
 }
 
@@ -270,7 +270,7 @@ int call(T f, Other... other) __arm_shared_za {
 // CHECK-NEXT: add nsw
 // CHECK-NEXT: add nsw
 // CHECK-NEXT: ret
-int test_variadic_template() __arm_shared_za {
+int test_variadic_template() __arm_inout("za") {
   return call(normal_callee,
               streaming_decl,
               streaming_compatible_decl,
@@ -286,18 +286,18 @@ int test_variadic_template() __arm_shared_za {
 // CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_pstate_za_new" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
 // CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" }
 // CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" }
 // CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" }
 // CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_pstate_za_shared" }
-// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" }
+// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" }
 // CHECK: attributes #[[NOUNWIND_CALL]] = { nounwind }
 // CHECK: attributes #[[NOUNWIND_SM_ENABLED_CALL]] = { nounwind "aarch64_pstate_sm_enabled" }
 // CHECK: attributes #[[NOUNWIND_SM_COMPATIBLE_CALL]] = { nounwind "aarch64_pstate_sm_compatible" }
 // CHECK: attributes #[[NOUNWIND_ZA_SHARED_CALL]] = { nounwind "aarch64_pstate_za_shared" }
-// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_pstate_za_preserved" }
+// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" }
 

diff  --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
index 55d2e355897f729..2ee14f6a7e882ce 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn);
 }
 
@@ -50,7 +50,7 @@ void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn);
 }
 
@@ -70,7 +70,7 @@ void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn);
 }
 
@@ -90,7 +90,7 @@ void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn);
 }
 
@@ -110,7 +110,7 @@ void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn);
 }
 
@@ -130,7 +130,7 @@ void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn);
 }
 
@@ -150,7 +150,7 @@ void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn);
 }
 
@@ -170,7 +170,7 @@ void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za32, _s32, _m)(3, pn, pm, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
index 8e9c2e7da46a3ae..a0fb9bdb4e25a31 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn);
 }
 
@@ -50,7 +50,7 @@ void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn);
 }
 
@@ -70,7 +70,7 @@ void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn);
 }
 
@@ -90,7 +90,7 @@ void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn);
 }
 
@@ -110,7 +110,7 @@ void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn);
 }
 
@@ -130,7 +130,7 @@ void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stream
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn);
 }
 
@@ -150,7 +150,7 @@ void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn);
 }
 
@@ -170,7 +170,7 @@ void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streami
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svaddva_za64, _s64, _m)(7, pn, pm, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
index e17782db222b60b..183a3986212bc37 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
@@ -22,7 +22,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_hor_za8(0, slice_base, pg, ptr);
   svld1_hor_za8(0, slice_base + 15, pg, ptr);
 }
@@ -45,7 +45,7 @@ void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_hor_za16(0, slice_base, pg, ptr);
   svld1_hor_za16(1, slice_base + 7, pg, ptr);
 }
@@ -68,7 +68,7 @@ void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_hor_za32(0, slice_base, pg, ptr);
   svld1_hor_za32(3, slice_base + 3, pg, ptr);
 }
@@ -91,7 +91,7 @@ void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_hor_za64(0, slice_base, pg, ptr);
   svld1_hor_za64(7, slice_base + 1, pg, ptr);
 }
@@ -112,7 +112,7 @@ void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_hor_za128(0, slice_base, pg, ptr);
   svld1_hor_za128(15, slice_base, pg, ptr);
 }
@@ -133,7 +133,7 @@ void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_ver_za8(0, slice_base, pg, ptr);
   svld1_ver_za8(0, slice_base + 15, pg, ptr);
 }
@@ -156,7 +156,7 @@ void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_ver_za16(0, slice_base, pg, ptr);
   svld1_ver_za16(1, slice_base + 7, pg, ptr);
 }
@@ -179,7 +179,7 @@ void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_ver_za32(0, slice_base, pg, ptr);
   svld1_ver_za32(3, slice_base + 3, pg, ptr);
 }
@@ -202,7 +202,7 @@ void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_ver_za64(0, slice_base, pg, ptr);
   svld1_ver_za64(7, slice_base + 1, pg, ptr);
 }
@@ -223,7 +223,7 @@ void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_shared_za {
+void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") {
   svld1_ver_za128(0, slice_base, pg, ptr);
   svld1_ver_za128(15, slice_base, pg, ptr);
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
index 0fa77e1144a7d95..68a294b1a237a02 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
@@ -28,7 +28,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -57,7 +57,7 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -86,7 +86,7 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -115,7 +115,7 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -142,7 +142,7 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
@@ -169,7 +169,7 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -198,7 +198,7 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -227,7 +227,7 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -256,7 +256,7 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -283,7 +283,7 @@ void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
   svld1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
   svld1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index 314c9645dd4f7d3..56eb7f45f6d9abc 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -12,7 +12,7 @@
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, 0);
 }
 
@@ -22,7 +22,7 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, 15);
 }
 
@@ -32,7 +32,7 @@ void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_shared_za
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_za(slice_base, ptr);
 }
 
@@ -43,7 +43,7 @@ void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_shared_za {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_shared_za {
+void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, vnum);
 }
 
@@ -53,6 +53,6 @@ void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
 // CHECK-NEXT:    ret void
 //
-void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_shared_za {
+void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_out("za") {
   svldr_vnum_za(slice_base, ptr, 16);
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index e84f31c2dfa92a8..5015f9bc9f41edf 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -42,7 +42,7 @@ void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -62,7 +62,7 @@ void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -82,7 +82,7 @@ void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
 
@@ -102,7 +102,7 @@ void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
 
@@ -118,7 +118,7 @@ void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
  SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -134,7 +134,7 @@ void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svusmopa_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index 1b22eb64e9e36ae..80457359e2a4e1a 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
@@ -50,7 +50,7 @@ void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -70,7 +70,7 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
  SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -110,7 +110,7 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
index 0ff97ff92f7147e..7dad7289d98adcf 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -42,7 +42,7 @@ void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -62,7 +62,7 @@ void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -82,7 +82,7 @@ void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm);
 }
 
@@ -102,7 +102,7 @@ void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm);
 }
 
@@ -118,7 +118,7 @@ void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
  SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm);
 }
 
@@ -134,7 +134,7 @@ void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 0, <vscale x 16 x i1> [[PN]], <vscale x 16 x i1> [[PM]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svusmops_za32, _u8, _m)(0, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index 5b190a7f9b748e0..e95a5ea2e694194 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -30,7 +30,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm);
 }
 
@@ -50,7 +50,7 @@ void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -70,7 +70,7 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 0, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
  SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm);
 }
 
@@ -110,7 +110,7 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN]], <vscale x 8 x i16> [[ZM]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
index 843a96da9027808..700564c1532e0d1 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -44,7 +44,7 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -63,7 +63,7 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
      return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -83,7 +83,7 @@ svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
      uint32_t slice = slice_base + 7;
      return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -102,7 +102,7 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -122,7 +122,7 @@ svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -141,7 +141,7 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -161,7 +161,7 @@ svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -178,7 +178,7 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -196,7 +196,7 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -215,7 +215,7 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -235,7 +235,7 @@ svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -254,7 +254,7 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -274,7 +274,7 @@ svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -293,7 +293,7 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -313,7 +313,7 @@ svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -332,7 +332,7 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -352,7 +352,7 @@ svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -371,7 +371,7 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -391,7 +391,7 @@ svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -410,7 +410,7 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -430,7 +430,7 @@ svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -449,7 +449,7 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -469,7 +469,7 @@ svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -486,7 +486,7 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -502,7 +502,7 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -520,7 +520,7 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -538,7 +538,7 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -556,7 +556,7 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -574,7 +574,7 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -592,7 +592,7 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -610,7 +610,7 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -626,7 +626,7 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -642,7 +642,7 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -660,7 +660,7 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -678,7 +678,7 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -696,7 +696,7 @@ svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -714,7 +714,7 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -732,7 +732,7 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -750,7 +750,7 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -768,7 +768,7 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -786,7 +786,7 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -804,7 +804,7 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -822,7 +822,7 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.horiz.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -840,7 +840,7 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -858,7 +858,7 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -876,7 +876,7 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -894,7 +894,7 @@ svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -910,7 +910,7 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -928,7 +928,7 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) _
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice);
 }
@@ -947,7 +947,7 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
      return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -967,7 +967,7 @@ svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
      uint32_t slice = slice_base + 7;
      return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice);
 }
@@ -986,7 +986,7 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1006,7 +1006,7 @@ svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice);
 }
@@ -1025,7 +1025,7 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1045,7 +1045,7 @@ svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice);
 }
@@ -1062,7 +1062,7 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1080,7 +1080,7 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 15;
     return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice);
 }
@@ -1099,7 +1099,7 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1119,7 +1119,7 @@ svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice);
 }
@@ -1138,7 +1138,7 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1158,7 +1158,7 @@ svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice);
 }
@@ -1177,7 +1177,7 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1197,7 +1197,7 @@ svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice);
 }
@@ -1216,7 +1216,7 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1236,7 +1236,7 @@ svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice);
 }
@@ -1255,7 +1255,7 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1275,7 +1275,7 @@ svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 7;
     return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice);
 }
@@ -1294,7 +1294,7 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1314,7 +1314,7 @@ svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 3;
     return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice);
 }
@@ -1333,7 +1333,7 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1353,7 +1353,7 @@ svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     uint32_t slice = slice_base + 1;
     return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice);
 }
@@ -1370,7 +1370,7 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1386,7 +1386,7 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base)
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1404,7 +1404,7 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1422,7 +1422,7 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1440,7 +1440,7 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1458,7 +1458,7 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1476,7 +1476,7 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1494,7 +1494,7 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1510,7 +1510,7 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1526,7 +1526,7 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> [[ZD]], <vscale x 16 x i1> [[PG]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1544,7 +1544,7 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1562,7 +1562,7 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 //
-svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1580,7 +1580,7 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1598,7 +1598,7 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 //
-svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1616,7 +1616,7 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1634,7 +1634,7 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 //
-svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1652,7 +1652,7 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1670,7 +1670,7 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 //
-svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1688,7 +1688,7 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1706,7 +1706,7 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.vert.nxv8bf16(<vscale x 8 x bfloat> [[ZD]], <vscale x 8 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP1]]
 //
-svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1724,7 +1724,7 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1742,7 +1742,7 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float> [[ZD]], <vscale x 4 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 //
-svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base);
 }
 
@@ -1760,7 +1760,7 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 0, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base);
 }
 
@@ -1778,7 +1778,7 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic
 // CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double> [[ZD]], <vscale x 2 x i1> [[TMP0]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 //
-svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_shared_za {
+svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") {
     return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff  --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
index 98ebbefc2e74ccd..97e20ab2629023c 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
@@ -22,7 +22,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za8(0, slice_base, pg, ptr);
   svst1_hor_za8(0, slice_base + 15, pg, ptr);
 }
@@ -45,7 +45,7 @@ void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za16(0, slice_base, pg, ptr);
   svst1_hor_za16(1, slice_base + 7, pg, ptr);
 }
@@ -68,7 +68,7 @@ void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za32(0, slice_base, pg, ptr);
   svst1_hor_za32(3, slice_base + 3, pg, ptr);
 }
@@ -91,7 +91,7 @@ void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za64(0, slice_base, pg, ptr);
   svst1_hor_za64(7, slice_base + 1, pg, ptr);
 }
@@ -112,7 +112,7 @@ void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_hor_za128(0, slice_base, pg, ptr);
   svst1_hor_za128(15, slice_base, pg, ptr);
 }
@@ -133,7 +133,7 @@ void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_str
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za8(0, slice_base, pg, ptr);
   svst1_ver_za8(0, slice_base + 15, pg, ptr);
 }
@@ -156,7 +156,7 @@ void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_strea
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za16(0, slice_base, pg, ptr);
   svst1_ver_za16(1, slice_base + 7, pg, ptr);
 }
@@ -179,7 +179,7 @@ void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za32(0, slice_base, pg, ptr);
   svst1_ver_za32(3, slice_base + 3, pg, ptr);
 }
@@ -202,7 +202,7 @@ void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za64(0, slice_base, pg, ptr);
   svst1_ver_za64(7, slice_base + 1, pg, ptr);
 }
@@ -223,7 +223,7 @@ void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_stre
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
   svst1_ver_za128(0, slice_base, pg, ptr);
   svst1_ver_za128(15, slice_base, pg, ptr);
 }

diff  --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
index 938e62a15c7716c..7566ad8889e0584 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -28,7 +28,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -57,7 +57,7 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -86,7 +86,7 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -115,7 +115,7 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -142,7 +142,7 @@ void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
   svst1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
@@ -169,7 +169,7 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
@@ -198,7 +198,7 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
@@ -227,7 +227,7 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
@@ -256,7 +256,7 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[ADD]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
@@ -283,7 +283,7 @@ void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_shared_za {
+void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
   svst1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
   svst1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
index 282819c8ca3501a..c3e4967bfe9b1bd 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
@@ -66,7 +66,7 @@ bool test_has_sme(void) __arm_streaming_compatible {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svundef_za(void) __arm_streaming_compatible __arm_shared_za {
+void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") {
   svundef_za();
 }
 

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index bcf368bc8dce464..d21c1ce7a8cd9f2 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -12,7 +12,7 @@
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, 0);
 }
 
@@ -22,7 +22,7 @@ void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_shared_za {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, 15);
 }
 
@@ -32,7 +32,7 @@ void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_shared_za {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_za(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_za(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_za(slice_base, ptr);
 }
 
@@ -43,7 +43,7 @@ void test_svstr_za(uint32_t slice_base, void *ptr) __arm_shared_za {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_shared_za {
+void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, vnum);
 }
 
@@ -53,6 +53,6 @@ void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_shared_za {
+void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_in("za") {
   svstr_vnum_za(slice_base, ptr, 16);
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
index 38c8402c3d0fa7e..5ddd90b95ce89c4 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
@@ -26,7 +26,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -44,7 +44,7 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
    uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -63,7 +63,7 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -83,7 +83,7 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -102,7 +102,7 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -122,7 +122,7 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -141,7 +141,7 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -161,7 +161,7 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -178,7 +178,7 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -196,7 +196,7 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -215,7 +215,7 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -235,7 +235,7 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -254,7 +254,7 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -274,7 +274,7 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -293,7 +293,7 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -313,7 +313,7 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -332,7 +332,7 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -352,7 +352,7 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -371,7 +371,7 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -391,7 +391,7 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
    uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -410,7 +410,7 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -430,7 +430,7 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -449,7 +449,7 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -469,7 +469,7 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -486,7 +486,7 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -502,7 +502,7 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
@@ -520,7 +520,7 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -538,7 +538,7 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
@@ -556,7 +556,7 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -574,7 +574,7 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
@@ -592,7 +592,7 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -610,7 +610,7 @@ void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
@@ -626,7 +626,7 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -642,7 +642,7 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
@@ -660,7 +660,7 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -678,7 +678,7 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
@@ -696,7 +696,7 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -714,7 +714,7 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
@@ -732,7 +732,7 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -750,7 +750,7 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
@@ -768,7 +768,7 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -786,7 +786,7 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
@@ -804,7 +804,7 @@ void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -822,7 +822,7 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
@@ -840,7 +840,7 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -858,7 +858,7 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
@@ -876,7 +876,7 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -894,7 +894,7 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 
@@ -910,7 +910,7 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -928,7 +928,7 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __ar
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn);
 }
@@ -947,7 +947,7 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -967,7 +967,7 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn);
 }
@@ -986,7 +986,7 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1006,7 +1006,7 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn);
 }
@@ -1025,7 +1025,7 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1045,7 +1045,7 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn);
 }
@@ -1062,7 +1062,7 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1080,7 +1080,7 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __a
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[ADD]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 15;
   SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn);
 }
@@ -1099,7 +1099,7 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1119,7 +1119,7 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn);
 }
@@ -1138,7 +1138,7 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1158,7 +1158,7 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn);
 }
@@ -1177,7 +1177,7 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1197,7 +1197,7 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn);
 }
@@ -1216,7 +1216,7 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1236,7 +1236,7 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn);
 }
@@ -1255,7 +1255,7 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1275,7 +1275,7 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[ADD]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 7;
   SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn);
 }
@@ -1294,7 +1294,7 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1314,7 +1314,7 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[ADD]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 3;
   SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn);
 }
@@ -1333,7 +1333,7 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1353,7 +1353,7 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[ADD]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   uint32_t slice = slice_base + 1;
   SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn);
 }
@@ -1370,7 +1370,7 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1386,7 +1386,7 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
@@ -1404,7 +1404,7 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1422,7 +1422,7 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1440,7 +1440,7 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1458,7 +1458,7 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1476,7 +1476,7 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1494,7 +1494,7 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
@@ -1510,7 +1510,7 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
@@ -1526,7 +1526,7 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) _
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE]], <vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
@@ -1544,7 +1544,7 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1562,7 +1562,7 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1580,7 +1580,7 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1598,7 +1598,7 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1616,7 +1616,7 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1634,7 +1634,7 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
@@ -1652,7 +1652,7 @@ void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1670,7 +1670,7 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1688,7 +1688,7 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
@@ -1706,7 +1706,7 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
@@ -1724,7 +1724,7 @@ void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 0, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
@@ -1742,7 +1742,7 @@ void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
@@ -1760,7 +1760,7 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 0, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
@@ -1778,7 +1778,7 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[ZN]])
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
   SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
index ddd960236953892..658092bc33ff918 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
@@ -18,7 +18,7 @@
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 0)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_mask_za(void) __arm_shared_za {
+void test_svzero_mask_za(void) __arm_inout("za") {
   svzero_mask_za(0);
 }
 
@@ -34,7 +34,7 @@ void test_svzero_mask_za(void) __arm_shared_za {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 176)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_mask_za_1(void) __arm_shared_za {
+void test_svzero_mask_za_1(void) __arm_inout("za") {
   svzero_mask_za(176);
 }
 
@@ -50,7 +50,7 @@ void test_svzero_mask_za_1(void) __arm_shared_za {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 255)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_mask_za_2(void) __arm_shared_za {
+void test_svzero_mask_za_2(void) __arm_inout("za") {
   svzero_mask_za(255);
 }
 
@@ -66,7 +66,7 @@ void test_svzero_mask_za_2(void) __arm_shared_za {
 // CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.zero(i32 255)
 // CHECK-CXX-NEXT:    ret void
 //
-void test_svzero_za(void) __arm_shared_za {
+void test_svzero_za(void) __arm_out("za") {
   svzero_za();
 }
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
index 46b4a0386502e30..311f965786137ac 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
@@ -36,7 +36,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -54,7 +54,7 @@ void test_svadd_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -72,7 +72,7 @@ void test_svadd_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svadd_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -114,7 +114,7 @@ void test_svadd_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -136,7 +136,7 @@ void test_svadd_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -158,7 +158,7 @@ void test_svadd_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -180,7 +180,7 @@ void test_svadd_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -208,7 +208,7 @@ void test_svadd_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -230,7 +230,7 @@ void test_svadd_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -252,7 +252,7 @@ void test_svadd_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -274,7 +274,7 @@ void test_svadd_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -306,7 +306,7 @@ void test_svadd_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -336,7 +336,7 @@ void test_svadd_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -366,7 +366,7 @@ void test_svadd_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -396,7 +396,7 @@ void test_svadd_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za {
+void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -420,7 +420,7 @@ void test_svadd_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x2)(slice_base, zn);
 }
 
@@ -438,7 +438,7 @@ void test_svadd_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x2)(slice_base , zn);
 }
 
@@ -456,7 +456,7 @@ void test_svadd_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x2)(slice_base, zn);
 }
 
@@ -474,7 +474,7 @@ void test_svadd_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x2)(slice_base, zn);
 }
 
@@ -492,7 +492,7 @@ void test_svadd_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x2)(slice_base, zn);
 }
 
@@ -510,7 +510,7 @@ void test_svadd_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x2)(slice_base, zn);
 }
 
@@ -534,7 +534,7 @@ void test_svadd_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za32,,_f32,,_vg1x4)(slice_base, zn);
 }
 
@@ -556,7 +556,7 @@ void test_svadd_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za32,,_s32,,_vg1x4)(slice_base, zn);
 }
 
@@ -578,7 +578,7 @@ void test_svadd_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za32,,_u32,,_vg1x4)(slice_base, zn);
 }
 
@@ -600,7 +600,7 @@ void test_svadd_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za64,,_f64,,_vg1x4)(slice_base, zn);
 }
 
@@ -622,7 +622,7 @@ void test_svadd_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za64,,_s64,,_vg1x4)(slice_base, zn);
 }
 
@@ -644,6 +644,6 @@ void test_svadd_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za {
+void test_svadd_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svadd_za64,,_u64,,_vg1x4)(slice_base, zn);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
index 67a330625884f87..e6415c66fbb4885 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
@@ -33,7 +33,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svbmopa_u32(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+void test_svbmopa_u32(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svbmopa_za32,_u32,_m,)(3, pn, pm, zn, zm);
 }
 
@@ -51,7 +51,7 @@ void test_svbmopa_u32(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svbmopa_s32(svbool_t pn, svbool_t pm, svint32_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+void test_svbmopa_s32(svbool_t pn, svbool_t pm, svint32_t zn, svint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svbmopa_za32,_s32,_m,)(3, pn, pm, zn, zm);
 }
 
@@ -71,7 +71,7 @@ void test_svbmopa_s32(svbool_t pn, svbool_t pm, svint32_t zn, svint32_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svbmops_u32(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+void test_svbmops_u32(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svbmops_za32,_u32,_m,)(3, pn, pm, zn, zm);
 }
 
@@ -89,6 +89,6 @@ void test_svbmops_u32(svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svbmops_s32(svbool_t pn, svbool_t pm, svint32_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+void test_svbmops_s32(svbool_t pn, svbool_t pm, svint32_t zn, svint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svbmops_za32,_s32,_m,)(3, pn, pm, zn, zm);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
index ff4176530710af5..c4651f4d83e1f99 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
@@ -35,7 +35,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_f16,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -65,7 +65,7 @@ void test_svdot_multi_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfl
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_f16,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -86,7 +86,7 @@ void test_svdot_multi_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfl
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_f16,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -108,7 +108,7 @@ void test_svdot_single_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svf
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_f16,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -129,7 +129,7 @@ void test_svdot_single_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svf
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,,_f16,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -151,7 +151,7 @@ void test_svdot_lane_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svflo
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,,_f16,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -176,7 +176,7 @@ void test_svdot_lane_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svflo
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_bf16,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -206,7 +206,7 @@ void test_svdot_multi_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, sv
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_bf16,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -227,7 +227,7 @@ void test_svdot_multi_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, sv
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_bf16,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -249,7 +249,7 @@ void test_svdot_single_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, s
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_bf16,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -270,7 +270,7 @@ void test_svdot_single_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, s
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,_bf16,,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -292,6 +292,6 @@ void test_svdot_lane_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svb
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,_bf16,,_vg1x4)(slice_base, zn, zm, 3);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
index 0d85071b7fc3e62..6e0e9433bae2fe0 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
@@ -35,7 +35,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_u16,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -65,7 +65,7 @@ void test_svdot_multi_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_u16,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -87,7 +87,7 @@ void test_svdot_multi_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_u8,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -117,7 +117,7 @@ void test_svdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_u8,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -139,7 +139,7 @@ void test_svdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za64,,,_u16,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -169,7 +169,7 @@ void test_svdot_multi_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za64,,,_u16,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -194,7 +194,7 @@ void test_svdot_multi_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_s16,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -224,7 +224,7 @@ void test_svdot_multi_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint1
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_s16,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -246,7 +246,7 @@ void test_svdot_multi_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint1
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_s8,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -276,7 +276,7 @@ void test_svdot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za32,,,_s8,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -298,7 +298,7 @@ void test_svdot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za64,,,_s16,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -328,7 +328,7 @@ void test_svdot_multi_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint1
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_multi_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_multi_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_za64,,,_s16,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -349,7 +349,7 @@ void test_svdot_multi_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint1
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_u16,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -371,7 +371,7 @@ void test_svdot_single_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svui
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_u16,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -389,7 +389,7 @@ void test_svdot_single_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svui
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_u8,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -411,7 +411,7 @@ void test_svdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_u8,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -429,7 +429,7 @@ void test_svdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za64,,_u16,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -451,7 +451,7 @@ void test_svdot_single_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svui
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za64,,_u16,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -472,7 +472,7 @@ void test_svdot_single_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svui
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_s16,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -494,7 +494,7 @@ void test_svdot_single_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_s16,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -512,7 +512,7 @@ void test_svdot_single_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_s8,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -534,7 +534,7 @@ void test_svdot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za32,,_s8,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -552,7 +552,7 @@ void test_svdot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za64,,_s16,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -574,7 +574,7 @@ void test_svdot_single_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_single_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_single_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_single_za64,,_s16,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -594,7 +594,7 @@ void test_svdot_single_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,_u16,,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -616,7 +616,7 @@ void test_svdot_lane_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,_u16,,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -634,7 +634,7 @@ void test_svdot_lane_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,_u8,,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -656,7 +656,7 @@ void test_svdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,_u8,,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -674,7 +674,7 @@ void test_svdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za64,,,_u16,_vg1x2)(slice_base, zn, zm, 1);
 }
 
@@ -696,7 +696,7 @@ void test_svdot_lane_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za64,,,_u16,_vg1x4)(slice_base, zn, zm, 1);
 }
 
@@ -717,7 +717,7 @@ void test_svdot_lane_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,,_s16,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -739,7 +739,7 @@ void test_svdot_lane_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,,_s16,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -757,7 +757,7 @@ void test_svdot_lane_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,,_s8,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -779,7 +779,7 @@ void test_svdot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za32,,,_s8,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -797,7 +797,7 @@ void test_svdot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za64,,,_s16,_vg1x2)(slice_base, zn, zm, 1);
 }
 
@@ -819,7 +819,7 @@ void test_svdot_lane_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svdot_lane_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svdot_lane_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svdot_lane_za64,,,_s16,_vg1x4)(slice_base, zn, zm, 1);
 }
 
@@ -844,7 +844,7 @@ void test_svdot_lane_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svusdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za {
+void test_svusdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svusdot_za32,,,_u8,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -874,7 +874,7 @@ void test_svusdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svusdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za {
+void test_svusdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svusdot_za32,,,_u8,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -895,7 +895,7 @@ void test_svusdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svusdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svusdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svusdot_single_za32,,_u8,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -917,7 +917,7 @@ void test_svusdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svusdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svusdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svusdot_single_za32,,_u8,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -937,7 +937,7 @@ void test_svusdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svusdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svusdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svusdot_lane_za32,,_u8,,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -959,7 +959,7 @@ void test_svusdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svusdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svusdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svusdot_lane_za32,,_u8,,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -980,7 +980,7 @@ void test_svusdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsudot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svsudot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsudot_single_za32,,_s8,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -1002,7 +1002,7 @@ void test_svsudot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsudot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svsudot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsudot_single_za32,,_s8,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -1026,7 +1026,7 @@ void test_svsudot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsudot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za {
+void test_svsudot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsudot_za32,,_s8,,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -1056,7 +1056,7 @@ void test_svsudot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsudot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za {
+void test_svsudot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsudot_za32,,_s8,,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -1076,7 +1076,7 @@ void test_svsudot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsudot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svsudot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsudot_lane_za32,,_s8,,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -1098,6 +1098,6 @@ void test_svsudot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsudot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svsudot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsudot_lane_za32,,_s8,,_vg1x4)(slice_base, zn, zm, 3);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
index 83fbd6e5855caa1..2e3cbd33d1e2c2d 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
@@ -20,7 +20,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_shared_za {
+void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_out("za") {
   svldr_zt(0, base);
 }
 
@@ -36,6 +36,6 @@ void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_shared_za
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svstr_zt(void *base) __arm_streaming_compatible __arm_shared_za __arm_preserves_za {
+void test_svstr_zt(void *base) __arm_streaming_compatible __arm_in("za") {
   svstr_zt(0, base);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
index cb34db3695b2428..a31c6d982100bfc 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
@@ -19,7 +19,7 @@
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u8(0, zn, 15);
 }
 
@@ -34,7 +34,7 @@ svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s8(0, zn, 15);
 }
 
@@ -48,7 +48,7 @@ svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za _
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u16(0, zn, 15);
 }
 
@@ -63,7 +63,7 @@ svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_z
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s16(0, zn, 15);
 }
 
@@ -77,7 +77,7 @@ svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.luti2.lane.zt.nxv8f16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_f16(0, zn, 15);
 }
 
@@ -91,7 +91,7 @@ svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.luti2.lane.zt.nxv8bf16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_bf16(0, zn, 15);
 }
 
@@ -105,7 +105,7 @@ svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u32(0, zn, 15);
 }
 
@@ -119,7 +119,7 @@ svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_z
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s32(0, zn, 15);
 }
 
@@ -133,6 +133,6 @@ svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.luti2.lane.zt.nxv4f32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_f32(0, zn, 15);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
index 04f37af46767ab1..db44f52a37bf063 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
@@ -26,7 +26,7 @@
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u8_x2(0, zn, 7);
 }
 
@@ -49,7 +49,7 @@ svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_z
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s8_x2(0, zn, 7);
 }
 
@@ -71,7 +71,7 @@ svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u16_x2(0, zn, 7);
 }
 
@@ -94,7 +94,7 @@ svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s16_x2(0, zn, 7);
 }
 
@@ -116,7 +116,7 @@ svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
 //
-svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_f16_x2(0, zn, 7);
 }
 
@@ -138,7 +138,7 @@ svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
 //
-svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_bf16_x2(0, zn, 7);
 }
 
@@ -160,7 +160,7 @@ svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u32_x2(0, zn, 7);
 }
 
@@ -182,7 +182,7 @@ svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s32_x2(0, zn, 7);
 }
 
@@ -204,6 +204,6 @@ svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
 //
-svfloat32x2_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x2_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_f32_x2(0, zn, 7);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
index 8c38d829a7f4cab..23b2c6cc51283a7 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
@@ -34,7 +34,7 @@
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u8_x4(0, zn, 3);
 }
 
@@ -65,7 +65,7 @@ svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_z
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s8_x4(0, zn, 3);
 }
 
@@ -95,7 +95,7 @@ svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u16_x4(0, zn, 3);
 }
 
@@ -125,7 +125,7 @@ svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s16_x4(0, zn, 3);
 }
 
@@ -155,7 +155,7 @@ svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
 //
-svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_f16_x4(0, zn, 3);
 }
 
@@ -185,7 +185,7 @@ svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
 //
-svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_bf16_x4(0, zn, 3);
 }
 
@@ -215,7 +215,7 @@ svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_u32_x4(0, zn, 3);
 }
 
@@ -245,7 +245,7 @@ svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_s32_x4(0, zn, 3);
 }
 
@@ -275,6 +275,6 @@ svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
 //
-svfloat32x4_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x4_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti2_lane_zt_f32_x4(0, zn, 3);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
index 9815b0e825b3032..c949cee1124ec06 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
@@ -19,7 +19,7 @@
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u8(0, zn, 7);
 }
 
@@ -34,7 +34,7 @@ svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s8(0, zn, 7);
 }
 
@@ -48,7 +48,7 @@ svint8_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za _
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u16(0, zn, 7);
 }
 
@@ -62,7 +62,7 @@ svuint16_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_z
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s16(0, zn, 7);
 }
 
@@ -76,7 +76,7 @@ svint16_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.luti4.lane.zt.nxv8f16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_f16(0, zn, 7);
 }
 
@@ -90,7 +90,7 @@ svfloat16_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.luti4.lane.zt.nxv8bf16(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_bf16(0, zn, 7);
 }
 
@@ -104,7 +104,7 @@ svbfloat16_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u32(0, zn, 7);
 }
 
@@ -118,7 +118,7 @@ svuint32_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_z
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s32(0, zn, 7);
 }
 
@@ -132,6 +132,6 @@ svint32_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.luti4.lane.zt.nxv4f32(i32 0, <vscale x 16 x i8> [[ZN:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_f32(0, zn, 7);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
index 4c181dd9123c568..61affd86d911960 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
@@ -26,7 +26,7 @@
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u8_x2(0, zn, 3);
 }
 
@@ -49,7 +49,7 @@ svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_z
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s8_x2(0, zn, 3);
 }
 
@@ -71,7 +71,7 @@ svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_shared_za
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u16_x2(0, zn, 3);
 }
 
@@ -94,7 +94,7 @@ svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s16_x2(0, zn, 3);
 }
 
@@ -116,7 +116,7 @@ svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
 //
-svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_f16_x2(0, zn, 3);
 }
 
@@ -138,7 +138,7 @@ svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
 //
-svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_bf16_x2(0, zn, 3);
 }
 
@@ -160,7 +160,7 @@ svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u32_x2(0, zn, 3);
 }
 
@@ -182,7 +182,7 @@ svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s32_x2(0, zn, 3);
 }
 
@@ -204,6 +204,6 @@ svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
 //
-svfloat32x2_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x2_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_f32_x2(0, zn, 3);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
index 9baccef888d5840..7b478aa2b6ad42a 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
@@ -36,7 +36,7 @@
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u16_x4(0, zn, 1);
 }
 
@@ -68,7 +68,7 @@ svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
 //
-svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_f16_x4(0, zn, 1);
 }
 
@@ -100,7 +100,7 @@ svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
 //
-svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_bf16_x4(0, zn, 1);
 }
 
@@ -132,7 +132,7 @@ svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s16_x4(0, zn, 1);
 }
 
@@ -164,7 +164,7 @@ svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_u32_x4(0, zn, 1);
 }
 
@@ -196,7 +196,7 @@ svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_s32_x4(0, zn, 1);
 }
 
@@ -228,6 +228,6 @@ svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
 //
-svfloat32x4_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x4_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") {
   return svluti4_lane_zt_f32_x4(0, zn, 1);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
index 2679f9cc8dfd0cd..18c4b77ac3292c0 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
@@ -35,7 +35,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
+void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x2,,)(slice_base, zn, zm);
 }
 
@@ -65,7 +65,7 @@ void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
+void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x4,,)(slice_base, zn, zm);
 }
 
@@ -85,7 +85,7 @@ void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -107,7 +107,7 @@ void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla,_single,_za32,_f32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -127,7 +127,7 @@ void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x2,,)(slice_base, zn, zm, 3);
 }
 
@@ -149,7 +149,7 @@ void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x4,,)(slice_base, zn, zm, 3);
 }
 
@@ -173,7 +173,7 @@ void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
+void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x2,,)(slice_base, zn, zm);
 }
 
@@ -203,7 +203,7 @@ void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
+void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x4,,)(slice_base, zn, zm);
 }
 
@@ -223,7 +223,7 @@ void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -245,7 +245,7 @@ void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla,_single,_za64,_f64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -265,7 +265,7 @@ void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x2,,)(slice_base, zn, zm, 1);
 }
 
@@ -287,6 +287,6 @@ void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x4,,)(slice_base, zn, zm, 1);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
index 032830512987af4..4b785cb86df6461 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -35,7 +35,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -58,7 +58,7 @@ void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -81,7 +81,7 @@ void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -104,7 +104,7 @@ void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -135,7 +135,7 @@ void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -166,7 +166,7 @@ void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -197,7 +197,7 @@ void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -228,7 +228,7 @@ void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -245,7 +245,7 @@ void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -260,7 +260,7 @@ void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -275,7 +275,7 @@ void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -290,7 +290,7 @@ void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -309,7 +309,7 @@ void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x2)(slice_base, zn, zm);
 }
@@ -328,7 +328,7 @@ void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x2)(slice_base, zn, zm);
 }
@@ -347,7 +347,7 @@ void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x2)(slice_base, zn, zm);
 }
@@ -366,7 +366,7 @@ void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x2)(slice_base, zn, zm);
 }
@@ -389,7 +389,7 @@ void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_f16,_vg2x4)(slice_base, zn, zm);
 }
@@ -412,7 +412,7 @@ void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_bf16,_vg2x4)(slice_base, zn, zm);
 }
@@ -435,7 +435,7 @@ void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_u16,_vg2x4)(slice_base, zn, zm);
 }
@@ -458,7 +458,7 @@ void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla,_single,_za32,_s16,_vg2x4)(slice_base, zn, zm);
 }
@@ -477,7 +477,7 @@ void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_f16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -492,7 +492,7 @@ void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_bf16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -507,7 +507,7 @@ void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_u16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -522,7 +522,7 @@ void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_s16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -541,7 +541,7 @@ void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_f16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -560,7 +560,7 @@ void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_bf16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -579,7 +579,7 @@ void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_u16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -598,7 +598,7 @@ void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_s16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -621,7 +621,7 @@ void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_f16,_vg2x4,,)(slice_base, zn, zm, 7);
 }
@@ -644,7 +644,7 @@ void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_bf16,_vg2x4,,)(slice_base, zn, zm, 7);
 }
@@ -667,7 +667,7 @@ void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_u16,_vg2x4,,)(slice_base, zn, zm, 7);
 }
@@ -690,7 +690,7 @@ void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmla_lane_za32,_s16,_vg2x4,,)(slice_base, zn, zm, 7);
 }

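The hunks above change each test signature from `__arm_streaming __arm_shared_za` to `__arm_streaming __arm_inout("za")` while leaving the bodies and CHECK lines untouched. A minimal standalone sketch of the same pattern, written without the SVE_ACLE_FUNC macro (this is illustrative and not part of the patch: it assumes the ACLE SME header is available as <arm_sme.h>, that the file is compiled for an SME2-capable target, and the function name accumulate_za32 is made up for the example):

#include <stdint.h>
#include <arm_sme.h>

// The multiply-accumulate reads and updates ZA, so the keyword attribute on
// the '+' lines is __arm_inout("za"), replacing the __arm_shared_za spelling
// shown on the corresponding '-' lines.
void accumulate_za32(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm)
    __arm_streaming __arm_inout("za") {
  svmla_za32_f16_vg2x2(slice_base, zn, zm);
}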
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
index 44e715e9581b7cf..50cbe3e4bac5ba8 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
@@ -31,7 +31,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -46,7 +46,7 @@ void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -61,7 +61,7 @@ void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -76,7 +76,7 @@ void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -93,7 +93,7 @@ void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -108,7 +108,7 @@ void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -123,7 +123,7 @@ void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -138,7 +138,7 @@ void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -155,7 +155,7 @@ void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_sumlall_single_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_sumlall_single_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -172,7 +172,7 @@ void test_sumlall_single_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
 }
@@ -197,7 +197,7 @@ void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
 }
@@ -216,7 +216,7 @@ void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x2)(slice_base, zn, zm);
 }
@@ -235,7 +235,7 @@ void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
 }
@@ -254,7 +254,7 @@ void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x2)(slice_base, zn, zm);
 }
@@ -275,7 +275,7 @@ void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
 }
@@ -294,7 +294,7 @@ void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x2)(slice_base, zn, zm);
 }
@@ -313,7 +313,7 @@ void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
 }
@@ -332,7 +332,7 @@ void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x2)(slice_base, zn, zm);
 }
@@ -353,7 +353,7 @@ void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x2)(slice_base, zn, zm);
 }
@@ -374,7 +374,7 @@ void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x2)(slice_base, zn, zm);
 }
@@ -403,7 +403,7 @@ void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
 }
@@ -426,7 +426,7 @@ void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za64,_s16,_vg4x4)(slice_base, zn, zm);
 }
@@ -449,7 +449,7 @@ void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
 }
@@ -472,7 +472,7 @@ void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla,_single,_za64,_u16,_vg4x4)(slice_base, zn, zm);
 }
@@ -497,7 +497,7 @@ void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
 }
@@ -520,7 +520,7 @@ void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za64,_s16,_vg4x4)(slice_base, zn, zm);
 }
@@ -543,7 +543,7 @@ void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
 }
@@ -566,7 +566,7 @@ void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls,_single,_za64,_u16,_vg4x4)(slice_base, zn, zm);
 }
@@ -591,7 +591,7 @@ void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla,_single,_za32,_s8,_vg4x4)(slice_base, zn, zm);
 }
@@ -616,7 +616,7 @@ void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla,_single,_za32,_u8,_vg4x4)(slice_base, zn, zm);
 }
@@ -645,7 +645,7 @@ void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -668,7 +668,7 @@ void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -691,7 +691,7 @@ void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -714,7 +714,7 @@ void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -739,7 +739,7 @@ void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -762,7 +762,7 @@ void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -785,7 +785,7 @@ void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -808,7 +808,7 @@ void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -833,7 +833,7 @@ void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_sumlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+void test_sumlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -858,7 +858,7 @@ void test_sumlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
 }
@@ -895,7 +895,7 @@ void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -926,7 +926,7 @@ void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -957,7 +957,7 @@ void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -988,7 +988,7 @@ void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -1021,7 +1021,7 @@ void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -1052,7 +1052,7 @@ void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -1083,7 +1083,7 @@ void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -1114,7 +1114,7 @@ void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -1147,7 +1147,7 @@ void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -1180,7 +1180,7 @@ void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
 }
@@ -1201,7 +1201,7 @@ void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
 }
@@ -1216,7 +1216,7 @@ void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7);
 }
@@ -1231,7 +1231,7 @@ void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
 }
@@ -1246,7 +1246,7 @@ void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7);
 }
@@ -1263,7 +1263,7 @@ void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
 }
@@ -1278,7 +1278,7 @@ void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7);
 }
@@ -1293,7 +1293,7 @@ void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
 }
@@ -1308,7 +1308,7 @@ void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7);
 }
@@ -1325,7 +1325,7 @@ void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
 }
@@ -1342,7 +1342,7 @@ void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
 }
@@ -1367,7 +1367,7 @@ void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
 }
@@ -1386,7 +1386,7 @@ void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7);
 }
@@ -1405,7 +1405,7 @@ void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
 }
@@ -1424,7 +1424,7 @@ void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7);
 }
@@ -1445,7 +1445,7 @@ void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
 }
@@ -1464,7 +1464,7 @@ void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7);
 }
@@ -1483,7 +1483,7 @@ void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
 }
@@ -1502,7 +1502,7 @@ void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7);
 }
@@ -1523,7 +1523,7 @@ void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
 }
@@ -1542,7 +1542,7 @@ void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
 }
@@ -1571,7 +1571,7 @@ void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
 }
@@ -1594,7 +1594,7 @@ void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7);
 }
@@ -1617,7 +1617,7 @@ void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
 }
@@ -1640,7 +1640,7 @@ void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7);
 }
@@ -1665,7 +1665,7 @@ void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
 }
@@ -1688,7 +1688,7 @@ void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7);
 }
@@ -1711,7 +1711,7 @@ void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
 }
@@ -1734,7 +1734,7 @@ void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7);
 }
@@ -1759,7 +1759,7 @@ void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
 }
@@ -1784,7 +1784,7 @@ void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_usmlall_lane_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+void test_usmlall_lane_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za")
 {
   SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
index fd4bf390b8232e8..98eaa929cfb7721 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
@@ -35,7 +35,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
+void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_za32,_f32,_vg1x2,,)(slice_base, zn, zm);
 }
 
@@ -65,7 +65,7 @@ void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
+void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_za32,_f32,_vg1x4,,)(slice_base, zn, zm);
 }
 
@@ -85,7 +85,7 @@ void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -107,7 +107,7 @@ void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls,_single,_za32,_f32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -127,7 +127,7 @@ void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_lane_za32,_f32,_vg1x2,,)(slice_base, zn, zm, 3);
 }
 
@@ -149,7 +149,7 @@ void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_lane_za32,_f32,_vg1x4,,)(slice_base, zn, zm, 3);
 }
 
@@ -173,7 +173,7 @@ void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
+void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_za64,_f64,_vg1x2,,)(slice_base, zn, zm);
 }
 
@@ -203,7 +203,7 @@ void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
+void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_za64,_f64,_vg1x4,,)(slice_base, zn, zm);
 }
 
@@ -223,7 +223,7 @@ void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -245,7 +245,7 @@ void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls,_single,_za64,_f64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -265,7 +265,7 @@ void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_lane_za64,_f64,_vg1x2,,)(slice_base, zn, zm, 1);
 }
 
@@ -287,6 +287,6 @@ void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmls_lane_za64,_f64,_vg1x4,,)(slice_base, zn, zm, 1);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
index afc7db797c9dfce..ea685079ebbcabc 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
@@ -35,7 +35,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -58,7 +58,7 @@ void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -81,7 +81,7 @@ void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -104,7 +104,7 @@ void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base, zn, zm);
 }
@@ -135,7 +135,7 @@ void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -166,7 +166,7 @@ void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -197,7 +197,7 @@ void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -228,7 +228,7 @@ void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __ar
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base, zn, zm);
 }
@@ -245,7 +245,7 @@ void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -260,7 +260,7 @@ void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -275,7 +275,7 @@ void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -290,7 +290,7 @@ void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x1,,)(slice_base, zn, zm);
 }
@@ -309,7 +309,7 @@ void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x2)(slice_base, zn, zm);
 }
@@ -328,7 +328,7 @@ void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x2)(slice_base, zn, zm);
 }
@@ -347,7 +347,7 @@ void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x2)(slice_base, zn, zm);
 }
@@ -366,7 +366,7 @@ void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x2)(slice_base, zn, zm);
 }
@@ -389,7 +389,7 @@ void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_f16,_vg2x4)(slice_base, zn, zm);
 }
@@ -412,7 +412,7 @@ void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t z
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_bf16,_vg2x4)(slice_base, zn, zm);
 }
@@ -435,7 +435,7 @@ void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_u16,_vg2x4)(slice_base, zn, zm);
 }
@@ -458,7 +458,7 @@ void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls,_single,_za32,_s16,_vg2x4)(slice_base, zn, zm);
 }
@@ -477,7 +477,7 @@ void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_f16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -492,7 +492,7 @@ void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_bf16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -507,7 +507,7 @@ void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_u16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -522,7 +522,7 @@ void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_s16,_vg2x1,,)(slice_base, zn, zm, 7);
 }
@@ -541,7 +541,7 @@ void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_f16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -560,7 +560,7 @@ void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_bf16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -579,7 +579,7 @@ void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_u16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -598,7 +598,7 @@ void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_s16,_vg2x2,,)(slice_base, zn, zm, 7);
 }
@@ -621,7 +621,7 @@ void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_f16,_vg2x4,,)(slice_base, zn, zm, 7);
 }
@@ -644,7 +644,7 @@ void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm)
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_bf16,_vg2x4,,)(slice_base, zn, zm, 7);
 }
@@ -667,7 +667,7 @@ void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_u16,_vg2x4,,)(slice_base, zn, zm, 7);
 }
@@ -690,7 +690,7 @@ void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za")
 {
    SVE_ACLE_FUNC(svmls_lane_za32,_s16,_vg2x4,,)(slice_base, zn, zm, 7);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
index eef99eb1b75a374..a2904dc1a602505 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
@@ -33,7 +33,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmopa_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmopa_za32,_s16,_m,)(3, pn, pm, zn, zm);
 }
 
@@ -51,7 +51,7 @@ void test_svmopa_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmopa_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmopa_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmopa_za32,_u16,_m,)(3, pn, pm, zn, zm);
 }
 
@@ -71,7 +71,7 @@ void test_svmopa_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __a
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.smops.za32.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmops_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmops_za32,_s16,_m,)(3, pn, pm, zn, zm);
 }
 
@@ -89,6 +89,6 @@ void test_svmops_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.umops.za32.nxv8i16(i32 3, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svmops_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svmops_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svmops_za32,_u16,_m,)(3, pn, pm, zn, zm);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
index 583a7fc8154728d..028a3d7b155dd55 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
@@ -25,7 +25,7 @@
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za8_u8_vg2(0, base);
 }
 
@@ -47,7 +47,7 @@ svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za8_s8_vg2(0, base);
 }
 
@@ -69,7 +69,7 @@ svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za8_u8_vg2(0, base);
 }
 
@@ -91,7 +91,7 @@ svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za8_s8_vg2(0, base);
 }
 
@@ -121,7 +121,7 @@ svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za8_u8_vg4(0, base);
 }
 
@@ -151,7 +151,7 @@ svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za8_s8_vg4(0, base);
 }
 
@@ -181,7 +181,7 @@ svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za8_u8_vg4(0, base);
 }
 
@@ -211,7 +211,7 @@ svuint8x4_t test_svread_ver_za8_u8_vg4(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za8_s8_vg4(0, base);
 }
 
@@ -233,7 +233,7 @@ svint8x4_t test_svread_ver_za8_s8_vg4(uint32_t base) __arm_streaming __arm_share
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_u16_vg2(1, base);
 }
 
@@ -255,7 +255,7 @@ svuint16x2_t test_svread_hor_za16_u16_vg2(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
 //
-svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_bf16_vg2(1, base);
 }
 
@@ -277,7 +277,7 @@ svbfloat16x2_t test_svread_hor_za16_bf16_vg2(uint32_t base) __arm_streaming __ar
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
 //
-svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_f16_vg2(1, base);
 }
 
@@ -299,7 +299,7 @@ svfloat16x2_t test_svread_hor_za16_f16_vg2(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_s16_vg2(1, base);
 }
 
@@ -321,7 +321,7 @@ svint16x2_t test_svread_hor_za16_s16_vg2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_u16_vg2(1, base);
 }
 
@@ -343,7 +343,7 @@ svuint16x2_t test_svread_ver_za16_u16_vg2(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
 //
-svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_bf16_vg2(1, base);
 }
 
@@ -365,7 +365,7 @@ svbfloat16x2_t test_svread_ver_za16_bf16_vg2(uint32_t base) __arm_streaming __ar
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
 //
-svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_f16_vg2(1, base);
 }
 
@@ -387,7 +387,7 @@ svfloat16x2_t test_svread_ver_za16_f16_vg2(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_s16_vg2(1, base);
 }
 
@@ -417,7 +417,7 @@ svint16x2_t test_svread_ver_za16_s16_vg2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svuint16x4_t test_svread_hor_za16_u16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x4_t test_svread_hor_za16_u16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_u16_vg4(1, base);
 }
 
@@ -447,7 +447,7 @@ svuint16x4_t test_svread_hor_za16_u16_vg4(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
 //
-svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_bf16_vg4(1, base);
 }
 
@@ -477,7 +477,7 @@ svbfloat16x4_t test_svread_hor_za16_bf16_vg4(uint32_t base) __arm_streaming __ar
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
 //
-svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_f16_vg4(1, base);
 }
 
@@ -507,7 +507,7 @@ svfloat16x4_t test_svread_hor_za16_f16_vg4(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za16_s16_vg4(1, base);
 }
 
@@ -537,7 +537,7 @@ svint16x4_t test_svread_hor_za16_s16_vg4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_u16_vg4(1, base);
 }
 
@@ -567,7 +567,7 @@ svuint16x4_t test_svread_ver_za16_u16_vg4(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
 //
-svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_bf16_vg4(1, base);
 }
 
@@ -597,7 +597,7 @@ svbfloat16x4_t test_svread_ver_za16_bf16_vg4(uint32_t base) __arm_streaming __ar
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
 //
-svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_f16_vg4(1, base);
 }
 
@@ -627,7 +627,7 @@ svfloat16x4_t test_svread_ver_za16_f16_vg4(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za16_s16_vg4(1, base);
 }
 
@@ -649,7 +649,7 @@ svint16x4_t test_svread_ver_za16_s16_vg4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za32_u32_vg2(3, base);
 }
 
@@ -671,7 +671,7 @@ svuint32x2_t test_svread_hor_za32_u32_vg2(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
 //
-svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za32_f32_vg2(3, base);
 }
 
@@ -693,7 +693,7 @@ svfloat32x2_t test_svread_hor_za32_f32_vg2(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za32_s32_vg2(3, base);
 }
 
@@ -715,7 +715,7 @@ svint32x2_t test_svread_hor_za32_s32_vg2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za32_u32_vg2(3, base);
 }
 
@@ -737,7 +737,7 @@ svuint32x2_t test_svread_ver_za32_u32_vg2(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
 //
-svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za32_f32_vg2(3, base);
 }
 
@@ -759,7 +759,7 @@ svfloat32x2_t test_svread_ver_za32_f32_vg2(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za32_s32_vg2(3, base);
 }
 
@@ -789,7 +789,7 @@ svint32x2_t test_svread_ver_za32_s32_vg2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za32_u32_vg4(3, base);
 }
 
@@ -819,7 +819,7 @@ svuint32x4_t test_svread_hor_za32_u32_vg4(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
 //
-svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za32_f32_vg4(3, base);
 }
 
@@ -849,7 +849,7 @@ svfloat32x4_t test_svread_hor_za32_f32_vg4(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za32_s32_vg4(3, base);
 }
 
@@ -879,7 +879,7 @@ svint32x4_t test_svread_hor_za32_s32_vg4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za32_u32_vg4(3, base);
 }
 
@@ -909,7 +909,7 @@ svuint32x4_t test_svread_ver_za32_u32_vg4(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
 //
-svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za32_f32_vg4(3, base);
 }
 
@@ -939,7 +939,7 @@ svfloat32x4_t test_svread_ver_za32_f32_vg4(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za32_s32_vg4(3, base);
 }
 
@@ -961,7 +961,7 @@ svint32x4_t test_svread_ver_za32_s32_vg4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
 //
-svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za64_u64_vg2(7, base);
 }
 
@@ -983,7 +983,7 @@ svuint64x2_t test_svread_hor_za64_u64_vg2(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP4]]
 //
-svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za64_f64_vg2(7, base);
 }
 
@@ -1005,7 +1005,7 @@ svfloat64x2_t test_svread_hor_za64_f64_vg2(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
 //
-svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za64_s64_vg2(7, base);
 }
 
@@ -1027,7 +1027,7 @@ svint64x2_t test_svread_hor_za64_s64_vg2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
 //
-svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za64_u64_vg2(7, base);
 }
 
@@ -1049,7 +1049,7 @@ svuint64x2_t test_svread_ver_za64_u64_vg2(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP4]]
 //
-svfloat64x2_t test_svread_ver_za64_f64_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat64x2_t test_svread_ver_za64_f64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za64_f64_vg2(7, base);
 }
 
@@ -1071,7 +1071,7 @@ svfloat64x2_t test_svread_ver_za64_f64_vg2(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
 //
-svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za64_s64_vg2(7, base);
 }
 
@@ -1101,7 +1101,7 @@ svint64x2_t test_svread_ver_za64_s64_vg2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
 //
-svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za64_u64_vg4(7, base);
 }
 
@@ -1131,7 +1131,7 @@ svuint64x4_t test_svread_hor_za64_u64_vg4(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP8]]
 //
-svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za64_f64_vg4(7, base);
 }
 
@@ -1161,7 +1161,7 @@ svfloat64x4_t test_svread_hor_za64_f64_vg4(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
 //
-svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_hor_za64_s64_vg4(7, base);
 }
 
@@ -1191,7 +1191,7 @@ svint64x4_t test_svread_hor_za64_s64_vg4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
 //
-svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za64_u64_vg4(7, base);
 }
 
@@ -1221,7 +1221,7 @@ svuint64x4_t test_svread_ver_za64_u64_vg4(uint32_t base) __arm_streaming __arm_s
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP8]]
 //
-svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za64_f64_vg4(7, base);
 }
 
@@ -1251,7 +1251,7 @@ svfloat64x4_t test_svread_ver_za64_f64_vg4(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
 //
-svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_ver_za64_s64_vg4(7, base);
 }
 
@@ -1273,7 +1273,7 @@ svint64x4_t test_svread_ver_za64_s64_vg4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svint8x2_t test_svread_za8_s8_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x2_t test_svread_za8_s8_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za8_s8_vg1x2(base);
 }
 
@@ -1295,7 +1295,7 @@ svint8x2_t test_svread_za8_s8_vg1x2(uint32_t base) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
 //
-svuint8x2_t test_svread_za8_u8_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x2_t test_svread_za8_u8_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za8_u8_vg1x2(base);
 }
 
@@ -1317,7 +1317,7 @@ svuint8x2_t test_svread_za8_u8_vg1x2(uint32_t base) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svint16x2_t test_svread_za16_s16_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x2_t test_svread_za16_s16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_s16_vg1x2(base);
 }
 
@@ -1339,7 +1339,7 @@ svint16x2_t test_svread_za16_s16_vg1x2(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
 //
-svuint16x2_t test_svread_za16_u16_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x2_t test_svread_za16_u16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_u16_vg1x2(base);
 }
 
@@ -1361,7 +1361,7 @@ svuint16x2_t test_svread_za16_u16_vg1x2(uint32_t base) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
 //
-svbfloat16x2_t test_svread_za16_bf16_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x2_t test_svread_za16_bf16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_bf16_vg1x2(base);
 }
 
@@ -1383,7 +1383,7 @@ svbfloat16x2_t test_svread_za16_bf16_vg1x2(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
 //
-svfloat16x2_t test_svread_za16_f16_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x2_t test_svread_za16_f16_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_f16_vg1x2(base);
 }
 
@@ -1405,7 +1405,7 @@ svfloat16x2_t test_svread_za16_f16_vg1x2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svint32x2_t test_svread_za32_s32_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x2_t test_svread_za32_s32_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za32_s32_vg1x2(base);
 }
 
@@ -1427,7 +1427,7 @@ svint32x2_t test_svread_za32_s32_vg1x2(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
 //
-svuint32x2_t test_svread_za32_u32_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x2_t test_svread_za32_u32_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za32_u32_vg1x2(base);
 }
 
@@ -1449,7 +1449,7 @@ svuint32x2_t test_svread_za32_u32_vg1x2(uint32_t base) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
 //
-svfloat32x2_t test_svread_za32_f32_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x2_t test_svread_za32_f32_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za32_f32_vg1x2(base);
 }
 
@@ -1471,7 +1471,7 @@ svfloat32x2_t test_svread_za32_f32_vg1x2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
 //
-svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za64_u64_vg1x2(base);
 }
 
@@ -1493,7 +1493,7 @@ svuint64x2_t test_svread_za64_u64_vg1x2(uint32_t base) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP4]]
 //
-svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za64_f64_vg1x2(base);
 }
 
@@ -1515,7 +1515,7 @@ svfloat64x2_t test_svread_za64_f64_vg1x2(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
 //
-svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za64_s64_vg1x2(base);
 }
 
@@ -1545,7 +1545,7 @@ svint64x2_t test_svread_za64_s64_vg1x2(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svint8x4_t test_svread_za8_s8_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint8x4_t test_svread_za8_s8_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za8_s8_vg1x4(base);
 }
 
@@ -1575,7 +1575,7 @@ svint8x4_t test_svread_za8_s8_vg1x4(uint32_t base) __arm_streaming __arm_shared_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
 // CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
 //
-svuint8x4_t test_svread_za8_u8_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint8x4_t test_svread_za8_u8_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za8_u8_vg1x4(base);
 }
 
@@ -1605,7 +1605,7 @@ svuint8x4_t test_svread_za8_u8_vg1x4(uint32_t base) __arm_streaming __arm_shared
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svint16x4_t test_svread_za16_s16_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint16x4_t test_svread_za16_s16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_s16_vg1x4(base);
 }
 
@@ -1635,7 +1635,7 @@ svint16x4_t test_svread_za16_s16_vg1x4(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
 //
-svuint16x4_t test_svread_za16_u16_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint16x4_t test_svread_za16_u16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_u16_vg1x4(base);
 }
 
@@ -1665,7 +1665,7 @@ svuint16x4_t test_svread_za16_u16_vg1x4(uint32_t base) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
 //
-svbfloat16x4_t test_svread_za16_bf16_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svbfloat16x4_t test_svread_za16_bf16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_bf16_vg1x4(base);
 }
 
@@ -1695,7 +1695,7 @@ svbfloat16x4_t test_svread_za16_bf16_vg1x4(uint32_t base) __arm_streaming __arm_
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
 // CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
 //
-svfloat16x4_t test_svread_za16_f16_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat16x4_t test_svread_za16_f16_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za16_f16_vg1x4(base);
 }
 
@@ -1725,7 +1725,7 @@ svfloat16x4_t test_svread_za16_f16_vg1x4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svint32x4_t test_svread_za32_s32_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint32x4_t test_svread_za32_s32_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za32_s32_vg1x4(base);
 }
 
@@ -1755,7 +1755,7 @@ svint32x4_t test_svread_za32_s32_vg1x4(uint32_t base) __arm_streaming __arm_shar
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
 //
-svuint32x4_t test_svread_za32_u32_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint32x4_t test_svread_za32_u32_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za32_u32_vg1x4(base);
 }
 
@@ -1785,7 +1785,7 @@ svuint32x4_t test_svread_za32_u32_vg1x4(uint32_t base) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
 // CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
 //
-svfloat32x4_t test_svread_za32_f32_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat32x4_t test_svread_za32_f32_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za32_f32_vg1x4(base);
 }
 
@@ -1815,7 +1815,7 @@ svfloat32x4_t test_svread_za32_f32_vg1x4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
 //
-svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za64_u64_vg1x4(base);
 }
 
@@ -1845,7 +1845,7 @@ svuint64x4_t test_svread_za64_u64_vg1x4(uint32_t base) __arm_streaming __arm_sha
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP8]]
 //
-svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za64_f64_vg1x4(base);
 }
 
@@ -1875,6 +1875,6 @@ svfloat64x4_t test_svread_za64_f64_vg1x4(uint32_t base) __arm_streaming __arm_sh
 // CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
 //
-svint64x4_t test_svread_za64_s64_vg1x4(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+svint64x4_t test_svread_za64_s64_vg1x4(uint32_t base) __arm_streaming __arm_in("za") {
   return svread_za64_s64_vg1x4(base);
 }

diff  --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
index 8c9a0a489ebf985..68913c5da4f75b5 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
@@ -36,7 +36,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -54,7 +54,7 @@ void test_svsub_write_single2_s32(uint32_t slice_base, svint32x2_t zn, svint32_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -72,7 +72,7 @@ void test_svsub_write_single2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -90,7 +90,7 @@ void test_svsub_write_single2_s64(uint32_t slice_base, svint64x2_t zn, svint64_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -114,7 +114,7 @@ void test_svsub_write_single2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za32,_s32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -136,7 +136,7 @@ void test_svsub_write_single4_s32(uint32_t slice_base, svint32x4_t zn, svint32_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za32,_u32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -158,7 +158,7 @@ void test_svsub_write_single4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za64,_s64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -180,7 +180,7 @@ void test_svsub_write_single4_s64(uint32_t slice_base, svint64x4_t zn, svint64_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.single.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[ZM:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,_single,_za64,_u64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -208,7 +208,7 @@ void test_svsub_write_single4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -230,7 +230,7 @@ void test_svsub_write_multi2_s32(uint32_t slice_base, svint32x2_t zn, svint32x2_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -252,7 +252,7 @@ void test_svsub_write_multi2_u32(uint32_t slice_base, svuint32x2_t zn, svuint32x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -274,7 +274,7 @@ void test_svsub_write_multi2_s64(uint32_t slice_base, svint64x2_t zn, svint64x2_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x2_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x2)(slice_base, zn, zm);
 }
 
@@ -306,7 +306,7 @@ void test_svsub_write_multi2_u64(uint32_t slice_base, svuint64x2_t zn, svuint64x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za32,_s32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -336,7 +336,7 @@ void test_svsub_write_multi4_s32(uint32_t slice_base, svint32x4_t zn, svint32x4_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za32,_u32,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -366,7 +366,7 @@ void test_svsub_write_multi4_u32(uint32_t slice_base, svuint32x4_t zn, svuint32x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za64,_s64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -396,7 +396,7 @@ void test_svsub_write_multi4_s64(uint32_t slice_base, svint64x4_t zn, svint64x4_
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.write.za.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_shared_za {
+void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x4_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_write,,_za64,_u64,_vg1x4)(slice_base, zn, zm);
 }
 
@@ -420,7 +420,7 @@ void test_svsub_write_multi4_u64(uint32_t slice_base, svuint64x4_t zn, svuint64x
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x2)(slice_base, zn);
 }
 
@@ -438,7 +438,7 @@ void test_svsub_za32_vg1x2_f32(uint32_t slice_base, svfloat32x2_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x2)(slice_base, zn);
 }
 
@@ -456,7 +456,7 @@ void test_svsub_za32_vg1x2_s32(uint32_t slice_base, svint32x2_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za32.vg1x2.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x2)(slice_base, zn);
 }
 
@@ -474,7 +474,7 @@ void test_svsub_za32_vg1x2_u32(uint32_t slice_base, svuint32x2_t zn) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x2)(slice_base, zn);
 }
 
@@ -492,7 +492,7 @@ void test_svsub_za64_vg1x2_f64(uint32_t slice_base, svfloat64x2_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x2)(slice_base, zn);
 }
 
@@ -510,7 +510,7 @@ void test_svsub_za64_vg1x2_s64(uint32_t slice_base, svint64x2_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za64.vg1x2.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x2)(slice_base, zn);
 }
 
@@ -534,7 +534,7 @@ void test_svsub_za64_vg1x2_u64(uint32_t slice_base, svuint64x2_t zn) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za32,,_f32,,_vg1x4)(slice_base, zn);
 }
 
@@ -556,7 +556,7 @@ void test_svsub_za32_vg1x4_f32(uint32_t slice_base, svfloat32x4_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za32,,_s32,,_vg1x4)(slice_base, zn);
 }
 
@@ -578,7 +578,7 @@ void test_svsub_za32_vg1x4_s32(uint32_t slice_base, svint32x4_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za32.vg1x4.nxv4i32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za32,,_u32,,_vg1x4)(slice_base, zn);
 }
 
@@ -600,7 +600,7 @@ void test_svsub_za32_vg1x4_u32(uint32_t slice_base, svuint32x4_t zn) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za64,,_f64,,_vg1x4)(slice_base, zn);
 }
 
@@ -622,7 +622,7 @@ void test_svsub_za64_vg1x4_f64(uint32_t slice_base, svfloat64x4_t zn) __arm_stre
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za64,,_s64,,_vg1x4)(slice_base, zn);
 }
 
@@ -644,6 +644,6 @@ void test_svsub_za64_vg1x4_s64(uint32_t slice_base, svint64x4_t zn) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.sub.za64.vg1x4.nxv2i64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_shared_za {
+void test_svsub_za64_vg1x4_u64(uint32_t slice_base, svuint64x4_t zn) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsub_za64,,_u64,,_vg1x4)(slice_base, zn);
 }

diff  --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
index fb313d4cebd72cf..9aa993f68b16318 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
@@ -28,7 +28,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za32_bf16_vg1x2(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za32_bf16_vg1x2(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za32,_bf16,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -46,7 +46,7 @@ void test_svvdot_lane_za32_bf16_vg1x2(uint32_t slice_base, svbfloat16x2_t zn, sv
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za32_f16_vg1x2(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za32_f16_vg1x2(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za32,_f16,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -64,7 +64,7 @@ void test_svvdot_lane_za32_f16_vg1x2(uint32_t slice_base, svfloat16x2_t zn, svfl
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za32_s16_vg1x2(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za32_s16_vg1x2(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za32,_s16,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -82,7 +82,7 @@ void test_svvdot_lane_za32_s16_vg1x2(uint32_t slice_base, svint16x2_t zn, svint1
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za32_u16_vg1x2(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za32_u16_vg1x2(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za32,_u16,_vg1x2)(slice_base, zn, zm, 3);
 }
 
@@ -104,7 +104,7 @@ void test_svvdot_lane_za32_u16_vg1x2(uint32_t slice_base, svuint16x2_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za32,_s8,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -126,7 +126,7 @@ void test_svvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za32,_u8,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -148,7 +148,7 @@ void test_svvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za64_s16_vg1x4(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za64_s16_vg1x4(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za64,_s16,_vg1x4)(slice_base, zn, zm, 1);
 }
 
@@ -170,7 +170,7 @@ void test_svvdot_lane_za64_s16_vg1x4(uint32_t slice_base, svint16x4_t zn, svint1
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svvdot_lane_za64_u16_vg1x4(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+void test_svvdot_lane_za64_u16_vg1x4(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svvdot_lane_za64,_u16,_vg1x4)(slice_base, zn, zm, 1);
 }
 
@@ -193,7 +193,7 @@ void test_svvdot_lane_za64_u16_vg1x4(uint32_t slice_base, svuint16x4_t zn, svuin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svsuvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+void test_svsuvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svsuvdot_lane_za32,_s8,_vg1x4)(slice_base, zn, zm, 3);
 }
 
@@ -216,7 +216,7 @@ void test_svsuvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svusvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+void test_svusvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svusvdot_lane_za32,_u8,_vg1x4)(slice_base, zn, zm, 3);
 }
 

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
index 14b0371bce5742a..bf8e12b6b49691f 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
@@ -30,7 +30,7 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8_vg2(uint32_t base, svuint8x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_u8_vg2(uint32_t base, svuint8x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg2,)(0, base, val);
 }
 
@@ -48,7 +48,7 @@ void test_svwrite_ver_za8_u8_vg2(uint32_t base, svuint8x2_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8_vg2(uint32_t base, svint8x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_s8_vg2(uint32_t base, svint8x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg2,)(0, base, val);
 }
 
@@ -66,7 +66,7 @@ void test_svwrite_ver_za8_s8_vg2(uint32_t base, svint8x2_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8_vg2(uint32_t base, svuint8x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_u8_vg2(uint32_t base, svuint8x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg2,)(0, base, val);
 }
 
@@ -84,7 +84,7 @@ void test_svwrite_hor_za8_u8_vg2(uint32_t base, svuint8x2_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8_vg2(uint32_t base, svint8x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_s8_vg2(uint32_t base, svint8x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg2,)(0, base, val);
 }
 
@@ -106,7 +106,7 @@ void test_svwrite_hor_za8_s8_vg2(uint32_t base, svint8x2_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8_vg4(uint32_t base, svuint8x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_u8_vg4(uint32_t base, svuint8x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za8,_u8,_vg4,)(0, base, val);
 }
 
@@ -128,7 +128,7 @@ void test_svwrite_hor_za8_u8_vg4(uint32_t base, svuint8x4_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8_vg4(uint32_t base, svint8x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za8_s8_vg4(uint32_t base, svint8x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za8,_s8,_vg4,)(0, base, val);
 }
 
@@ -150,7 +150,7 @@ void test_svwrite_hor_za8_s8_vg4(uint32_t base, svint8x4_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8_vg4(uint32_t base, svuint8x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_u8_vg4(uint32_t base, svuint8x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za8,_u8,_vg4,)(0, base, val);
 }
 
@@ -172,7 +172,7 @@ void test_svwrite_ver_za8_u8_vg4(uint32_t base, svuint8x4_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8_vg4(uint32_t base, svint8x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za8_s8_vg4(uint32_t base, svint8x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za8,_s8,_vg4,)(0, base, val);
 }
 
@@ -190,7 +190,7 @@ void test_svwrite_ver_za8_s8_vg4(uint32_t base, svint8x4_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16_vg2(uint32_t base, svuint16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_u16_vg2(uint32_t base, svuint16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg2,)(1, base, val);
 }
 
@@ -208,7 +208,7 @@ void test_svwrite_hor_za16_u16_vg2(uint32_t base, svuint16x2_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg2,)(1, base, val);
 }
 
@@ -226,7 +226,7 @@ void test_svwrite_hor_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) __arm_str
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16_vg2(uint32_t base, svfloat16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_f16_vg2(uint32_t base, svfloat16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg2,)(1, base, val);
 }
 
@@ -244,7 +244,7 @@ void test_svwrite_hor_za16_f16_vg2(uint32_t base, svfloat16x2_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16_vg2(uint32_t base, svint16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_s16_vg2(uint32_t base, svint16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg2,)(1, base, val);
 }
 
@@ -262,7 +262,7 @@ void test_svwrite_hor_za16_s16_vg2(uint32_t base, svint16x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16_vg2(uint32_t base, svuint16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_u16_vg2(uint32_t base, svuint16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg2,)(1, base, val);
 }
 
@@ -280,7 +280,7 @@ void test_svwrite_ver_za16_u16_vg2(uint32_t base, svuint16x2_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg2,)(1, base, val);
 }
 
@@ -298,7 +298,7 @@ void test_svwrite_ver_za16_bf16_vg2(uint32_t base, svbfloat16x2_t val) __arm_str
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16_vg2(uint32_t base, svfloat16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_f16_vg2(uint32_t base, svfloat16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg2,)(1, base, val);
 }
 
@@ -316,7 +316,7 @@ void test_svwrite_ver_za16_f16_vg2(uint32_t base, svfloat16x2_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16_vg2(uint32_t base, svint16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_s16_vg2(uint32_t base, svint16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg2,)(1, base, val);
 }
 
@@ -338,7 +338,7 @@ void test_svwrite_ver_za16_s16_vg2(uint32_t base, svint16x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16_vg4(uint32_t base, svuint16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_u16_vg4(uint32_t base, svuint16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_u16,_vg4,)(1, base, val);
 }
 
@@ -360,7 +360,7 @@ void test_svwrite_hor_za16_u16_vg4(uint32_t base, svuint16x4_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_bf16,_vg4,)(1, base, val);
 }
 
@@ -382,7 +382,7 @@ void test_svwrite_hor_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) __arm_str
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16_vg4(uint32_t base, svfloat16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_f16_vg4(uint32_t base, svfloat16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_f16,_vg4,)(1, base, val);
 }
 
@@ -404,7 +404,7 @@ void test_svwrite_hor_za16_f16_vg4(uint32_t base, svfloat16x4_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16_vg4(uint32_t base, svint16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za16_s16_vg4(uint32_t base, svint16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za16,_s16,_vg4,)(1, base, val);
 }
 
@@ -426,7 +426,7 @@ void test_svwrite_hor_za16_s16_vg4(uint32_t base, svint16x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16_vg4(uint32_t base, svuint16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_u16_vg4(uint32_t base, svuint16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_u16,_vg4,)(1, base, val);
 }
 
@@ -448,7 +448,7 @@ void test_svwrite_ver_za16_u16_vg4(uint32_t base, svuint16x4_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_bf16,_vg4,)(1, base, val);
 }
 
@@ -470,7 +470,7 @@ void test_svwrite_ver_za16_bf16_vg4(uint32_t base, svbfloat16x4_t val) __arm_str
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16_vg4(uint32_t base, svfloat16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_f16_vg4(uint32_t base, svfloat16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_f16,_vg4,)(1, base, val);
 }
 
@@ -492,7 +492,7 @@ void test_svwrite_ver_za16_f16_vg4(uint32_t base, svfloat16x4_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16_vg4(uint32_t base, svint16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za16_s16_vg4(uint32_t base, svint16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za16,_s16,_vg4,)(1, base, val);
 }
 
@@ -510,7 +510,7 @@ void test_svwrite_ver_za16_s16_vg4(uint32_t base, svint16x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32_vg2(uint32_t base, svuint32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_u32_vg2(uint32_t base, svuint32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg2,)(3, base, val);
 }
 
@@ -528,7 +528,7 @@ void test_svwrite_hor_za32_u32_vg2(uint32_t base, svuint32x2_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32_vg2(uint32_t base, svfloat32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_f32_vg2(uint32_t base, svfloat32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg2,)(3, base, val);
 }
 
@@ -546,7 +546,7 @@ void test_svwrite_hor_za32_f32_vg2(uint32_t base, svfloat32x2_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32_vg2(uint32_t base, svint32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_s32_vg2(uint32_t base, svint32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg2,)(3, base, val);
 }
 
@@ -564,7 +564,7 @@ void test_svwrite_hor_za32_s32_vg2(uint32_t base, svint32x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32_vg2(uint32_t base, svuint32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_u32_vg2(uint32_t base, svuint32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg2,)(3, base, val);
 }
 
@@ -582,7 +582,7 @@ void test_svwrite_ver_za32_u32_vg2(uint32_t base, svuint32x2_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32_vg2(uint32_t base, svfloat32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_f32_vg2(uint32_t base, svfloat32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg2,)(3, base, val);
 }
 
@@ -600,7 +600,7 @@ void test_svwrite_ver_za32_f32_vg2(uint32_t base, svfloat32x2_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32_vg2(uint32_t base, svint32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_s32_vg2(uint32_t base, svint32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg2,)(3, base, val);
 }
 
@@ -622,7 +622,7 @@ void test_svwrite_ver_za32_s32_vg2(uint32_t base, svint32x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32_vg4(uint32_t base, svuint32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_u32_vg4(uint32_t base, svuint32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za32,_u32,_vg4,)(3, base, val);
 }
 
@@ -644,7 +644,7 @@ void test_svwrite_hor_za32_u32_vg4(uint32_t base, svuint32x4_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32_vg4(uint32_t base, svfloat32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_f32_vg4(uint32_t base, svfloat32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za32,_f32,_vg4,)(3, base, val);
 }
 
@@ -666,7 +666,7 @@ void test_svwrite_hor_za32_f32_vg4(uint32_t base, svfloat32x4_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32_vg4(uint32_t base, svint32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za32_s32_vg4(uint32_t base, svint32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za32,_s32,_vg4,)(3, base, val);
 }
 
@@ -688,7 +688,7 @@ void test_svwrite_hor_za32_s32_vg4(uint32_t base, svint32x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32_vg4(uint32_t base, svuint32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_u32_vg4(uint32_t base, svuint32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za32,_u32,_vg4,)(3, base, val);
 }
 
@@ -710,7 +710,7 @@ void test_svwrite_ver_za32_u32_vg4(uint32_t base, svuint32x4_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32_vg4(uint32_t base, svfloat32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_f32_vg4(uint32_t base, svfloat32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za32,_f32,_vg4,)(3, base, val);
 }
 
@@ -732,7 +732,7 @@ void test_svwrite_ver_za32_f32_vg4(uint32_t base, svfloat32x4_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 3, i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32_vg4(uint32_t base, svint32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za32_s32_vg4(uint32_t base, svint32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za32,_s32,_vg4,)(3, base, val);
 }
 
@@ -750,7 +750,7 @@ void test_svwrite_ver_za32_s32_vg4(uint32_t base, svint32x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64_vg2(uint32_t base, svuint64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_u64_vg2(uint32_t base, svuint64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg2,)(7, base, val);
 }
 
@@ -768,7 +768,7 @@ void test_svwrite_hor_za64_u64_vg2(uint32_t base, svuint64x2_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64_vg2(uint32_t base, svfloat64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_f64_vg2(uint32_t base, svfloat64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg2,)(7, base, val);
 }
 
@@ -786,7 +786,7 @@ void test_svwrite_hor_za64_f64_vg2(uint32_t base, svfloat64x2_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64_vg2(uint32_t base, svint64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_s64_vg2(uint32_t base, svint64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg2,)(7, base, val);
 }
 
@@ -804,7 +804,7 @@ void test_svwrite_hor_za64_s64_vg2(uint32_t base, svint64x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64_vg2(uint32_t base, svuint64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_u64_vg2(uint32_t base, svuint64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg2,)(7, base, val);
 }
 
@@ -822,7 +822,7 @@ void test_svwrite_ver_za64_u64_vg2(uint32_t base, svuint64x2_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64_vg2(uint32_t base, svfloat64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_f64_vg2(uint32_t base, svfloat64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg2,)(7, base, val);
 }
 
@@ -840,7 +840,7 @@ void test_svwrite_ver_za64_f64_vg2(uint32_t base, svfloat64x2_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64_vg2(uint32_t base, svint64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_s64_vg2(uint32_t base, svint64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg2,)(7, base, val);
 }
 
@@ -862,7 +862,7 @@ void test_svwrite_ver_za64_s64_vg2(uint32_t base, svint64x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64_vg4(uint32_t base, svuint64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_u64_vg4(uint32_t base, svuint64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za64,_u64,_vg4,)(7, base, val);
 }
 
@@ -884,7 +884,7 @@ void test_svwrite_hor_za64_u64_vg4(uint32_t base, svuint64x4_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64_vg4(uint32_t base, svuint64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_u64_vg4(uint32_t base, svuint64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za64,_u64,_vg4,)(7, base, val);
 }
 
@@ -906,7 +906,7 @@ void test_svwrite_ver_za64_u64_vg4(uint32_t base, svuint64x4_t val) __arm_stream
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64_vg4(uint32_t base, svfloat64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_f64_vg4(uint32_t base, svfloat64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za64,_f64,_vg4,)(7, base, val);
 }
 
@@ -928,7 +928,7 @@ void test_svwrite_hor_za64_f64_vg4(uint32_t base, svfloat64x4_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64_vg4(uint32_t base, svint64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_hor_za64_s64_vg4(uint32_t base, svint64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_hor_za64,_s64,_vg4,)(7, base, val);
 }
 
@@ -950,7 +950,7 @@ void test_svwrite_hor_za64_s64_vg4(uint32_t base, svint64x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64_vg4(uint32_t base, svfloat64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_f64_vg4(uint32_t base, svfloat64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za64,_f64,_vg4,)(7, base, val);
 }
 
@@ -972,7 +972,7 @@ void test_svwrite_ver_za64_f64_vg4(uint32_t base, svfloat64x4_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 7, i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64_vg4(uint32_t base, svint64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_ver_za64_s64_vg4(uint32_t base, svint64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_ver_za64,_s64,_vg4,)(7, base, val);
 }
 
@@ -990,7 +990,7 @@ void test_svwrite_ver_za64_s64_vg4(uint32_t base, svint64x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za8_s8_vg1x2(uint32_t base, svint8x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za8_s8_vg1x2(uint32_t base, svint8x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za8,_s8,_vg1x2,)(base, val);
 }
 
@@ -1008,7 +1008,7 @@ void test_svwrite_za8_s8_vg1x2(uint32_t base, svint8x2_t val) __arm_streaming __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za8_u8_vg1x2(uint32_t base, svuint8x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za8_u8_vg1x2(uint32_t base, svuint8x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za8,_u8,_vg1x2,)(base, val);
 }
 
@@ -1026,7 +1026,7 @@ void test_svwrite_za8_u8_vg1x2(uint32_t base, svuint8x2_t val) __arm_streaming _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_s16_vg1x2(uint32_t base, svint16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_s16_vg1x2(uint32_t base, svint16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_s16,_vg1x2,)(base, val);
 }
 
@@ -1044,7 +1044,7 @@ void test_svwrite_za16_s16_vg1x2(uint32_t base, svint16x2_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_u16_vg1x2(uint32_t base, svuint16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_u16_vg1x2(uint32_t base, svuint16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_u16,_vg1x2,)(base, val);
 }
 
@@ -1062,7 +1062,7 @@ void test_svwrite_za16_u16_vg1x2(uint32_t base, svuint16x2_t val) __arm_streamin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 [[BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_bf16_vg1x2(uint32_t base, svbfloat16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_bf16_vg1x2(uint32_t base, svbfloat16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_bf16,_vg1x2,)(base, val);
 }
 
@@ -1080,7 +1080,7 @@ void test_svwrite_za16_bf16_vg1x2(uint32_t base, svbfloat16x2_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 [[BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_f16_vg1x2(uint32_t base, svfloat16x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_f16_vg1x2(uint32_t base, svfloat16x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_f16,_vg1x2,)(base, val);
 }
 
@@ -1098,7 +1098,7 @@ void test_svwrite_za16_f16_vg1x2(uint32_t base, svfloat16x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za32_s32_vg1x2(uint32_t base, svint32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za32_s32_vg1x2(uint32_t base, svint32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za32,_s32,_vg1x2,)(base, val);
 }
 
@@ -1116,7 +1116,7 @@ void test_svwrite_za32_s32_vg1x2(uint32_t base, svint32x2_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za32_u32_vg1x2(uint32_t base, svuint32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za32_u32_vg1x2(uint32_t base, svuint32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za32,_u32,_vg1x2,)(base, val);
 }
 
@@ -1134,7 +1134,7 @@ void test_svwrite_za32_u32_vg1x2(uint32_t base, svuint32x2_t val) __arm_streamin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 [[BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za32_f32_vg1x2(uint32_t base, svfloat32x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za32_f32_vg1x2(uint32_t base, svfloat32x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za32,_f32,_vg1x2,)(base, val);
 }
 
@@ -1152,7 +1152,7 @@ void test_svwrite_za32_f32_vg1x2(uint32_t base, svfloat32x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za64_u64_vg1x2(uint32_t base, svuint64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za64_u64_vg1x2(uint32_t base, svuint64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x2,)(base, val);
 }
 
@@ -1170,7 +1170,7 @@ void test_svwrite_za64_u64_vg1x2(uint32_t base, svuint64x2_t val) __arm_streamin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 [[BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za64_f64_vg1x2(uint32_t base, svfloat64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za64_f64_vg1x2(uint32_t base, svfloat64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x2,)(base, val);
 }
 
@@ -1188,7 +1188,7 @@ void test_svwrite_za64_f64_vg1x2(uint32_t base, svfloat64x2_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za64_s64_vg1x2(uint32_t base, svint64x2_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za64_s64_vg1x2(uint32_t base, svint64x2_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x2,)(base, val);
 }
 
@@ -1210,7 +1210,7 @@ void test_svwrite_za64_s64_vg1x2(uint32_t base, svint64x2_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za8_s8_vg1x4(uint32_t base, svint8x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za8_s8_vg1x4(uint32_t base, svint8x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za8,_s8,_vg1x4,)(base, val);
 }
 
@@ -1232,7 +1232,7 @@ void test_svwrite_za8_s8_vg1x4(uint32_t base, svint8x4_t val) __arm_streaming __
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 [[BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za8_u8_vg1x4(uint32_t base, svuint8x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za8_u8_vg1x4(uint32_t base, svuint8x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za8,_u8,_vg1x4,)(base, val);
 }
 
@@ -1254,7 +1254,7 @@ void test_svwrite_za8_u8_vg1x4(uint32_t base, svuint8x4_t val) __arm_streaming _
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_s16_vg1x4(uint32_t base, svint16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_s16_vg1x4(uint32_t base, svint16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_s16,_vg1x4,)(base, val);
 }
 
@@ -1276,7 +1276,7 @@ void test_svwrite_za16_s16_vg1x4(uint32_t base, svint16x4_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 [[BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_u16_vg1x4(uint32_t base, svuint16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_u16_vg1x4(uint32_t base, svuint16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_u16,_vg1x4,)(base, val);
 }
 
@@ -1298,7 +1298,7 @@ void test_svwrite_za16_u16_vg1x4(uint32_t base, svuint16x4_t val) __arm_streamin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 [[BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_bf16_vg1x4(uint32_t base, svbfloat16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_bf16_vg1x4(uint32_t base, svbfloat16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_bf16,_vg1x4,)(base, val);
 }
 
@@ -1320,7 +1320,7 @@ void test_svwrite_za16_bf16_vg1x4(uint32_t base, svbfloat16x4_t val) __arm_strea
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 [[BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za16_f16_vg1x4(uint32_t base, svfloat16x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za16_f16_vg1x4(uint32_t base, svfloat16x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za16,_f16,_vg1x4,)(base, val);
 }
 
@@ -1342,7 +1342,7 @@ void test_svwrite_za16_f16_vg1x4(uint32_t base, svfloat16x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za32_s32_vg1x4(uint32_t base, svint32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za32_s32_vg1x4(uint32_t base, svint32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za32,_s32,_vg1x4,)(base, val);
 }
 
@@ -1364,7 +1364,7 @@ void test_svwrite_za32_s32_vg1x4(uint32_t base, svint32x4_t val) __arm_streaming
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 [[BASE:%.*]], <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za32_u32_vg1x4(uint32_t base, svuint32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za32_u32_vg1x4(uint32_t base, svuint32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za32,_u32,_vg1x4,)(base, val);
 }
 
@@ -1386,7 +1386,7 @@ void test_svwrite_za32_u32_vg1x4(uint32_t base, svuint32x4_t val) __arm_streamin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 [[BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za32_f32_vg1x4(uint32_t base, svfloat32x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za32_f32_vg1x4(uint32_t base, svfloat32x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za32,_f32,_vg1x4,)(base, val);
 }
 
@@ -1408,7 +1408,7 @@ void test_svwrite_za32_f32_vg1x4(uint32_t base, svfloat32x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za64_u64_vg1x4(uint32_t base, svuint64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za64_u64_vg1x4(uint32_t base, svuint64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za64,_u64,_vg1x4,)(base, val);
 }
 
@@ -1430,7 +1430,7 @@ void test_svwrite_za64_u64_vg1x4(uint32_t base, svuint64x4_t val) __arm_streamin
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 [[BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za64_f64_vg1x4(uint32_t base, svfloat64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za64_f64_vg1x4(uint32_t base, svfloat64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za64,_f64,_vg1x4,)(base, val);
 }
 
@@ -1452,6 +1452,6 @@ void test_svwrite_za64_f64_vg1x4(uint32_t base, svfloat64x4_t val) __arm_streami
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 [[BASE:%.*]], <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svwrite_za64_s64_vg1x4(uint32_t base, svint64x4_t val) __arm_streaming __arm_shared_za {
+void test_svwrite_za64_s64_vg1x4(uint32_t base, svint64x4_t val) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svwrite_za64,_s64,_vg1x4,)(base, val);
 }

diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
index 31e8d6850fb289d..081b1a1d2627cef 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
@@ -18,6 +18,6 @@
 // CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.zt(i32 0)
 // CPP-CHECK-NEXT:    ret void
 //
-void test_svzero_zt(void) __arm_streaming_compatible __arm_shared_za {
+void test_svzero_zt(void) __arm_streaming_compatible __arm_out("za") {
   svzero_zt(0);
 }

diff --git a/clang/test/Modules/aarch64-sme-keywords.cppm b/clang/test/Modules/aarch64-sme-keywords.cppm
index 6784aaa01d21983..df4dd32b16cff7c 100644
--- a/clang/test/Modules/aarch64-sme-keywords.cppm
+++ b/clang/test/Modules/aarch64-sme-keywords.cppm
@@ -13,8 +13,8 @@ export module A;
 
 export void f_streaming(void) __arm_streaming { }
 export void f_streaming_compatible(void) __arm_streaming_compatible { }
-export void f_shared_za(void) __arm_shared_za { }
-export void f_preserves_za(void) __arm_preserves_za { }
+export void f_shared_za(void) __arm_inout("za") { }
+export void f_preserves_za(void) __arm_preserves("za") { }
 
 //--- Use.cpp
 // expected-no-diagnostics
@@ -50,11 +50,11 @@ import A;
 // CHECK-DAG: attributes #[[STREAMING_DECL]] = {{{.*}} "aarch64_pstate_sm_enabled" {{.*}}}
 // CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_DECL]] = {{{.*}} "aarch64_pstate_sm_compatible" {{.*}}}
 // CHECK-DAG: attributes #[[SHARED_ZA_USE]] = { "aarch64_pstate_za_shared" }
-// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_pstate_za_preserved" }
+// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_pstate_za_preserved" "aarch64_pstate_za_shared" }
 // CHECK-DAG: attributes #[[STREAMING_USE]] = { "aarch64_pstate_sm_enabled" }
 // CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_USE]] = { "aarch64_pstate_sm_compatible" }
 
-void f_shared_za_caller(void) __arm_shared_za {
+void f_shared_za_caller(void) __arm_inout("za") {
   f_shared_za();
   f_preserves_za();
 }

diff --git a/clang/test/Parser/c2x-attribute-keywords.c b/clang/test/Parser/c2x-attribute-keywords.c
index d8291b710e6db64..b88d2b9c23e697a 100644
--- a/clang/test/Parser/c2x-attribute-keywords.c
+++ b/clang/test/Parser/c2x-attribute-keywords.c
@@ -1,60 +1,64 @@
-// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %s
-// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %s
-
-enum __arm_streaming E { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-  One __arm_streaming, // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+// RUN: sed -e "s at ATTR_USE@__arm_streaming at g" -e "s at ATTR_NAME@__arm_streaming at g" %s > %t
+// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %t
+// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %t
+// RUN: sed -e "s at ATTR_USE@__arm_inout\(\"za\"\)@g" -e "s at ATTR_NAME@__arm_inout at g" %s > %t
+// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %t
+// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %t
+
+enum ATTR_USE E { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  One ATTR_USE, // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
   Two,
-  Three __arm_streaming // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+  Three ATTR_USE // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 };
 
-enum __arm_streaming { Four }; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-__arm_streaming enum E2 { Five }; // expected-error {{misplaced '__arm_streaming'}}
+enum ATTR_USE { Four }; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+ATTR_USE enum E2 { Five }; // expected-error {{misplaced 'ATTR_NAME'}}
 
 // FIXME: this diagnostic can be improved.
-enum { __arm_streaming Six }; // expected-error {{expected identifier}}
+enum { ATTR_USE Six }; // expected-error {{expected identifier}}
 
 // FIXME: this diagnostic can be improved.
-enum E3 __arm_streaming { Seven }; // expected-error {{expected identifier or '('}}
-
-struct __arm_streaming S1 { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-  int i __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
-  int __arm_streaming j; // expected-error {{'__arm_streaming' only applies to function types}}
-  int k[10] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
-  int l __arm_streaming[10]; // expected-error {{'__arm_streaming' only applies to function types}}
-  __arm_streaming int m, n; // expected-error {{'__arm_streaming' only applies to function types}}
-  int o __arm_streaming : 12; // expected-error {{'__arm_streaming' only applies to function types}}
-  int __arm_streaming : 0; // expected-error {{'__arm_streaming' only applies to function types}}
-  int p, __arm_streaming : 0; // expected-error {{'__arm_streaming' cannot appear here}}
-  int q, __arm_streaming r; // expected-error {{'__arm_streaming' cannot appear here}}
-  __arm_streaming int; // expected-error {{'__arm_streaming' cannot appear here}} \
+enum E3 ATTR_USE { Seven }; // expected-error {{expected identifier or '('}}
+
+struct ATTR_USE S1 { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  int i ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int ATTR_USE j; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int k[10] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int l ATTR_USE[10]; // expected-error {{'ATTR_NAME' only applies to function types}}
+  ATTR_USE int m, n; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int o ATTR_USE : 12; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int ATTR_USE : 0; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int p, ATTR_USE : 0; // expected-error {{'ATTR_NAME' cannot appear here}}
+  int q, ATTR_USE r; // expected-error {{'ATTR_NAME' cannot appear here}}
+  ATTR_USE int; // expected-error {{'ATTR_NAME' cannot appear here}} \
             // expected-warning {{declaration does not declare anything}}
 };
 
-__arm_streaming struct S2 { int a; }; // expected-error {{misplaced '__arm_streaming'}}
-struct S3 __arm_streaming { int a; }; // expected-error {{'__arm_streaming' cannot appear here}} \
-                                         expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+ATTR_USE struct S2 { int a; }; // expected-error {{misplaced 'ATTR_NAME'}}
+struct S3 ATTR_USE { int a; }; // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                         expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
-union __arm_streaming U { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-  double d __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types; type here is 'double'}}
-  __arm_streaming int i; // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}}
+union ATTR_USE U { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  double d ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types; type here is 'double'}}
+  ATTR_USE int i; // expected-error {{'ATTR_NAME' only applies to function types; type here is 'int'}}
 };
 
-__arm_streaming union U2 { double d; }; // expected-error {{misplaced '__arm_streaming'}}
-union U3 __arm_streaming { double d; }; // expected-error {{'__arm_streaming' cannot appear here}} \
-                                           expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+ATTR_USE union U2 { double d; }; // expected-error {{misplaced 'ATTR_NAME'}}
+union U3 ATTR_USE { double d; }; // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                           expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
-struct __arm_streaming IncompleteStruct; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-union __arm_streaming IncompleteUnion; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-enum __arm_streaming IncompleteEnum; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+struct ATTR_USE IncompleteStruct; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+union ATTR_USE IncompleteUnion; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+enum ATTR_USE IncompleteEnum; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
-__arm_streaming void f1(void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-void __arm_streaming f2(void); // expected-error {{'__arm_streaming' only applies to function types}}
-void f3 __arm_streaming (void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-void f4(void) __arm_streaming;
+ATTR_USE void f1(void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+void ATTR_USE f2(void); // expected-error {{'ATTR_NAME' only applies to function types}}
+void f3 ATTR_USE (void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+void f4(void) ATTR_USE;
 
-void f5(int i __arm_streaming, __arm_streaming int j, int __arm_streaming k); // expected-error 3 {{'__arm_streaming' only applies to function types}}
+void f5(int i ATTR_USE, ATTR_USE int j, int ATTR_USE k); // expected-error 3 {{'ATTR_NAME' only applies to function types}}
 
-void f6(a, b) __arm_streaming int a; int b; { // expected-error {{'__arm_streaming' cannot appear here}} \
+void f6(a, b) ATTR_USE int a; int b; { // expected-error {{'ATTR_NAME' cannot appear here}} \
                                                  c2x-warning {{deprecated}}
 }
 
@@ -63,57 +67,74 @@ void f6(a, b) __arm_streaming int a; int b; { // expected-error {{'__arm_streami
 // behavior given that we *don't* want to parse it as part of the K&R parameter
 // declarations. It is disallowed to avoid a parsing ambiguity we already
 // handle well.
-int (*f7(a, b))(int, int) __arm_streaming int a; int b; { // c2x-warning {{deprecated}}
+int (*f7(a, b))(int, int) ATTR_USE int a; int b; { // c2x-warning {{deprecated}}
   return 0;
 }
 
-__arm_streaming int a, b; // expected-error {{'__arm_streaming' only applies to function types}}
-int c __arm_streaming, d __arm_streaming; // expected-error 2 {{'__arm_streaming' only applies to function types}}
+ATTR_USE int a, b; // expected-error {{'ATTR_NAME' only applies to function types}}
+int c ATTR_USE, d ATTR_USE; // expected-error 2 {{'ATTR_NAME' only applies to function types}}
 
-void f8(void) __arm_streaming {
-  __arm_streaming int i, j; // expected-error {{'__arm_streaming' only applies to function types}}
-  int k, l __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
+void f8(void) ATTR_USE {
+  ATTR_USE int i, j; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int k, l ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
 }
 
-__arm_streaming void f9(void) { // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-  int i[10] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
-  int (*fp1)(void)__arm_streaming;
-  int (*fp2 __arm_streaming)(void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
+ATTR_USE void f9(void) { // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+  int i[10] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
+  int (*fp1)(void)ATTR_USE;
+  int (*fp2 ATTR_USE)(void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
 
-  int * __arm_streaming *ipp; // expected-error {{'__arm_streaming' only applies to function types}}
+  int * ATTR_USE *ipp; // expected-error {{'ATTR_NAME' only applies to function types}}
 }
 
-void f10(int j[static 10] __arm_streaming, int k[*] __arm_streaming); // expected-error 2 {{'__arm_streaming' only applies to function types}}
+void f10(int j[static 10] ATTR_USE, int k[*] ATTR_USE); // expected-error 2 {{'ATTR_NAME' only applies to function types}}
 
 void f11(void) {
-  __arm_streaming {} // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming if (1) {} // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE {} // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE if (1) {} // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
 
-  __arm_streaming switch (1) { // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming case 1: __arm_streaming break; // expected-error 2 {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming default: break; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE switch (1) { // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE case 1: ATTR_USE break; // expected-error 2 {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE default: break; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
   }
 
   goto foo;
-  __arm_streaming foo: (void)1; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+  ATTR_USE foo: (void)1; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
-  __arm_streaming for (;;); // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming while (1); // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming do __arm_streaming { } while(1); // expected-error 2 {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE for (;;); // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE while (1); // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE do ATTR_USE { } while(1); // expected-error 2 {{'ATTR_NAME' cannot be applied to a statement}}
 
-  __arm_streaming (void)1; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE (void)1; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
 
-  __arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
 
-  (void)sizeof(int [4]__arm_streaming); // expected-error {{'__arm_streaming' only applies to function types}}
-  (void)sizeof(struct __arm_streaming S3 { int a __arm_streaming; }); // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} \
-                                                                      // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}}
+  (void)sizeof(int [4]ATTR_USE); // expected-error {{'ATTR_NAME' only applies to function types}}
+  (void)sizeof(struct ATTR_USE S3 { int a ATTR_USE; }); // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} \
+                                                                      // expected-error {{'ATTR_NAME' only applies to function types; type here is 'int'}}
 
-  __arm_streaming return; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE return; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
 
-  __arm_streaming asm (""); // expected-error {{'__arm_streaming' cannot appear here}}
+  ATTR_USE asm (""); // expected-error {{'ATTR_NAME' cannot appear here}}
 }
 
-struct __arm_streaming S4 *s; // expected-error {{'__arm_streaming' cannot appear here}}
+struct ATTR_USE S4 *s; // expected-error {{'ATTR_NAME' cannot appear here}}
 struct S5 {};
-int c = sizeof(struct __arm_streaming S5); // expected-error {{'__arm_streaming' cannot appear here}}
+int c = sizeof(struct ATTR_USE S5); // expected-error {{'ATTR_NAME' cannot appear here}}
+
+void invalid_parentheses1() __arm_inout; // expected-error {{expected '(' after ''__arm_inout''}}
+void invalid_parentheses2() __arm_inout(; // expected-error {{expected string literal as argument of '__arm_inout' attribute}}
+void invalid_parentheses3() __arm_inout((); // expected-error {{expected string literal as argument of '__arm_inout' attribute}}
+void invalid_parentheses4() __arm_inout); // expected-error {{expected '(' after ''__arm_inout''}} \
+                                          // expected-error {{expected function body after function declarator}}
+void invalid_parentheses5() __arm_inout(());  // expected-error {{expected string literal as argument of '__arm_inout' attribute}}
+void invalid_parentheses6() __arm_inout("za"; // expected-error {{expected ')'}}
+void invalid_parentheses7() __arm_streaming(; // expected-error {{expected parameter declarator}} \
+                                              // expected-error {{expected ')'}} \
+                                              // expected-note {{to match this '('}} \
+                                              // expected-error {{function cannot return function type 'void ()'}} \
+                                              // expected-error {{'__arm_streaming' only applies to function types; type here is 'int ()'}} \
+                                              // expected-warning {{'__arm_streaming' only applies to non-K&R-style functions}}
+void invalid_parentheses8() __arm_streaming();  // expected-error {{function cannot return function type 'void ()'}} \
+                                                // expected-error {{'__arm_streaming' only applies to function types; type here is 'int ()'}} \
+                                                // expected-warning {{'__arm_streaming' only applies to non-K&R-style functions}}
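
Aside, for orientation (an illustrative sketch, not part of the diff; the names below are made up and an AArch64 target with +sme is assumed, as in the RUN lines): the invalid_parentheses* cases above exercise the new argument form, in which the keyword attribute takes a parenthesised string literal naming the state. The accepted spellings look like:

  void load_za(void) __arm_inout("za");       // reads and writes ZA
  void query_za(void) __arm_preserves("za");  // promises to leave ZA unchanged
  __arm_new("za") void setup_za(void) { }     // creates fresh ZA state for the body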

diff --git a/clang/test/Parser/c2x-attribute-keywords.m b/clang/test/Parser/c2x-attribute-keywords.m
index 2296be13cb714cc..575c88ffffc3d66 100644
--- a/clang/test/Parser/c2x-attribute-keywords.m
+++ b/clang/test/Parser/c2x-attribute-keywords.m
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify %s
 
-enum __arm_streaming E1 : int; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+enum __arm_inout("za") E1 : int; // expected-error {{'__arm_inout' only applies to non-K&R-style functions}}
 
 @interface Base
 @end
@@ -15,5 +15,5 @@ - (S *) foo;
 
 
 void f(T *t) {
-  __arm_streaming[[t foo] bar]; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  __arm_inout("za")[[t foo] bar]; // expected-error {{'__arm_inout' cannot be applied to a statement}}
 }

diff --git a/clang/test/Parser/cxx0x-keyword-attributes.cpp b/clang/test/Parser/cxx0x-keyword-attributes.cpp
index 8d31efac5320802..be7423cc7ecee36 100644
--- a/clang/test/Parser/cxx0x-keyword-attributes.cpp
+++ b/clang/test/Parser/cxx0x-keyword-attributes.cpp
@@ -1,4 +1,7 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme %s
+// RUN: sed -e "s at ATTR_USE@__arm_streaming at g" -e "s at ATTR_NAME@__arm_streaming at g" %s > %t
+// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme -x c++ %t
+// RUN: sed -e "s at ATTR_USE@__arm_inout\(\"za\"\)@g" -e "s at ATTR_NAME@__arm_inout at g" %s > %t
+// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme -x c++ %t
 
 // Need std::initializer_list
 namespace std {
@@ -35,136 +38,136 @@ namespace std {
 
 
 // Declaration syntax checks
-__arm_streaming int before_attr; // expected-error {{'__arm_streaming' only applies to function types}}
-int __arm_streaming between_attr; // expected-error {{'__arm_streaming' only applies to function types}}
-const __arm_streaming int between_attr_2 = 0; // expected-error {{'__arm_streaming' cannot appear here}}
-int after_attr __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
-int * __arm_streaming ptr_attr; // expected-error {{'__arm_streaming' only applies to function types}}
-int & __arm_streaming ref_attr = after_attr; // expected-error {{'__arm_streaming' only applies to function types}}
-int && __arm_streaming rref_attr = 0; // expected-error {{'__arm_streaming' only applies to function types}}
-int array_attr [1] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
-void fn_attr () __arm_streaming;
-void noexcept_fn_attr () noexcept __arm_streaming;
+ATTR_USE int before_attr; // expected-error {{'ATTR_NAME' only applies to function types}}
+int ATTR_USE between_attr; // expected-error {{'ATTR_NAME' only applies to function types}}
+const ATTR_USE int between_attr_2 = 0; // expected-error {{'ATTR_NAME' cannot appear here}}
+int after_attr ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
+int * ATTR_USE ptr_attr; // expected-error {{'ATTR_NAME' only applies to function types}}
+int & ATTR_USE ref_attr = after_attr; // expected-error {{'ATTR_NAME' only applies to function types}}
+int && ATTR_USE rref_attr = 0; // expected-error {{'ATTR_NAME' only applies to function types}}
+int array_attr [1] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
+void fn_attr () ATTR_USE;
+void noexcept_fn_attr () noexcept ATTR_USE;
 struct MemberFnOrder {
-  virtual void f() const volatile && noexcept __arm_streaming final = 0;
+  virtual void f() const volatile && noexcept ATTR_USE final = 0;
 };
-struct __arm_streaming struct_attr; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-class __arm_streaming class_attr {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-union __arm_streaming union_attr; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-enum __arm_streaming E { }; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+struct ATTR_USE struct_attr; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+class ATTR_USE class_attr {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+union ATTR_USE union_attr; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+enum ATTR_USE E { }; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 namespace test_misplacement {
-__arm_streaming struct struct_attr2;  // expected-error {{misplaced '__arm_streaming'}}
-__arm_streaming class class_attr2; // expected-error {{misplaced '__arm_streaming'}}
-__arm_streaming union union_attr2; // expected-error {{misplaced '__arm_streaming'}}
-__arm_streaming enum  E2 { }; // expected-error {{misplaced '__arm_streaming'}}
+ATTR_USE struct struct_attr2;  // expected-error {{misplaced 'ATTR_NAME'}}
+ATTR_USE class class_attr2; // expected-error {{misplaced 'ATTR_NAME'}}
+ATTR_USE union union_attr2; // expected-error {{misplaced 'ATTR_NAME'}}
+ATTR_USE enum  E2 { }; // expected-error {{misplaced 'ATTR_NAME'}}
 }
 
 // Checks attributes placed at wrong syntactic locations of class specifiers.
-class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
-  attr_after_class_name_decl __arm_streaming __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here}} \
-                                                                 expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
+class ATTR_USE ATTR_USE // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  attr_after_class_name_decl ATTR_USE ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                                                 expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
-class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
- attr_after_class_name_definition __arm_streaming __arm_streaming __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}} \
-                                                                                        expected-error 3 {{'__arm_streaming' only applies to non-K&R-style functions}}
+class ATTR_USE ATTR_USE // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
+ attr_after_class_name_definition ATTR_USE ATTR_USE ATTR_USE{}; // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                                                                        expected-error 3 {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
-class __arm_streaming c {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-class c __arm_streaming __arm_streaming x; // expected-error 2 {{'__arm_streaming' only applies to function types}}
-class c __arm_streaming __arm_streaming y __arm_streaming __arm_streaming; // expected-error 4 {{'__arm_streaming' only applies to function types}}
+class ATTR_USE c {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+class c ATTR_USE ATTR_USE x; // expected-error 2 {{'ATTR_NAME' only applies to function types}}
+class c ATTR_USE ATTR_USE y ATTR_USE ATTR_USE; // expected-error 4 {{'ATTR_NAME' only applies to function types}}
 class c final [(int){0}];
 
 class base {};
-class __arm_streaming __arm_streaming final_class // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
-  __arm_streaming alignas(float) final // expected-error {{'__arm_streaming' cannot appear here}} \
-                                          expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-  __arm_streaming alignas(float) __arm_streaming alignas(float): base{}; // expected-error {{'__arm_streaming' cannot appear here}}
+class ATTR_USE ATTR_USE final_class // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  ATTR_USE alignas(float) final // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                          expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  ATTR_USE alignas(float) ATTR_USE alignas(float): base{}; // expected-error {{'ATTR_NAME' cannot appear here}}
 
-class __arm_streaming __arm_streaming final_class_another // expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
-  __arm_streaming __arm_streaming alignas(16) final // expected-error {{'__arm_streaming' cannot appear here}} \
-                                                       expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
-  __arm_streaming __arm_streaming alignas(16) __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}}
+class ATTR_USE ATTR_USE final_class_another // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  ATTR_USE ATTR_USE alignas(16) final // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                                       expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  ATTR_USE ATTR_USE alignas(16) ATTR_USE{}; // expected-error {{'ATTR_NAME' cannot appear here}}
 
-class after_class_close {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "class" to apply it to the type declaration}}
+class after_class_close {} ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here, place it after "class" to apply it to the type declaration}}
 
 class C {};
 
-__arm_streaming struct with_init_declarators {} init_declarator; // expected-error {{'__arm_streaming' only applies to function types}}
-__arm_streaming struct no_init_declarators; // expected-error {{misplaced '__arm_streaming'}}
-template<typename> __arm_streaming struct no_init_declarators_template; // expected-error {{'__arm_streaming' cannot appear here}}
+ATTR_USE struct with_init_declarators {} init_declarator; // expected-error {{'ATTR_NAME' only applies to function types}}
+ATTR_USE struct no_init_declarators; // expected-error {{misplaced 'ATTR_NAME'}}
+template<typename> ATTR_USE struct no_init_declarators_template; // expected-error {{'ATTR_NAME' cannot appear here}}
 void fn_with_structs() {
-  __arm_streaming struct with_init_declarators {} init_declarator; // expected-error {{'__arm_streaming' only applies to function types}}
-  __arm_streaming struct no_init_declarators; // expected-error {{'__arm_streaming' cannot appear here}}
+  ATTR_USE struct with_init_declarators {} init_declarator; // expected-error {{'ATTR_NAME' only applies to function types}}
+  ATTR_USE struct no_init_declarators; // expected-error {{'ATTR_NAME' cannot appear here}}
 }
-__arm_streaming; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+ATTR_USE; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 struct ctordtor {
-  __arm_streaming ctordtor __arm_streaming () __arm_streaming; // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}}
-  ctordtor (C) __arm_streaming;
-  __arm_streaming ~ctordtor __arm_streaming () __arm_streaming; // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}}
+  ATTR_USE ctordtor ATTR_USE () ATTR_USE; // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}}
+  ctordtor (C) ATTR_USE;
+  ATTR_USE ~ctordtor ATTR_USE () ATTR_USE; // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}}
 };
-__arm_streaming ctordtor::ctordtor __arm_streaming () __arm_streaming {} // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}}
-__arm_streaming ctordtor::ctordtor (C) __arm_streaming try {} catch (...) {} // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-__arm_streaming ctordtor::~ctordtor __arm_streaming () __arm_streaming {} // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}}
-extern "C++" __arm_streaming int extern_attr; // expected-error {{'__arm_streaming' only applies to function types}}
-template <typename T> __arm_streaming void template_attr (); // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-__arm_streaming __arm_streaming int __arm_streaming __arm_streaming multi_attr __arm_streaming __arm_streaming; // expected-error 6 {{'__arm_streaming' only applies to function types}}
-
-int (paren_attr) __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here}}
-unsigned __arm_streaming int attr_in_decl_spec; // expected-error {{'__arm_streaming' cannot appear here}}
-unsigned __arm_streaming int __arm_streaming const double_decl_spec = 0; // expected-error 2 {{'__arm_streaming' cannot appear here}}
+ATTR_USE ctordtor::ctordtor ATTR_USE () ATTR_USE {} // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}}
+ATTR_USE ctordtor::ctordtor (C) ATTR_USE try {} catch (...) {} // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+ATTR_USE ctordtor::~ctordtor ATTR_USE () ATTR_USE {} // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}}
+extern "C++" ATTR_USE int extern_attr; // expected-error {{'ATTR_NAME' only applies to function types}}
+template <typename T> ATTR_USE void template_attr (); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+ATTR_USE ATTR_USE int ATTR_USE ATTR_USE multi_attr ATTR_USE ATTR_USE; // expected-error 6 {{'ATTR_NAME' only applies to function types}}
+
+int (paren_attr) ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here}}
+unsigned ATTR_USE int attr_in_decl_spec; // expected-error {{'ATTR_NAME' cannot appear here}}
+unsigned ATTR_USE int ATTR_USE const double_decl_spec = 0; // expected-error 2 {{'ATTR_NAME' cannot appear here}}
 class foo {
-  void const_after_attr () __arm_streaming const; // expected-error {{expected ';'}}
+  void const_after_attr () ATTR_USE const; // expected-error {{expected ';'}}
 };
-extern "C++" __arm_streaming { } // expected-error {{'__arm_streaming' cannot appear here}}
-__arm_streaming extern "C++" { } // expected-error {{'__arm_streaming' cannot appear here}}
-__arm_streaming template <typename T> void before_template_attr (); // expected-error {{'__arm_streaming' cannot appear here}}
-__arm_streaming namespace ns { int i; } // expected-error {{'__arm_streaming' cannot appear here}}
-__arm_streaming static_assert(true, ""); //expected-error {{'__arm_streaming' cannot appear here}}
-__arm_streaming asm(""); // expected-error {{'__arm_streaming' cannot appear here}}
-
-__arm_streaming using ns::i; // expected-warning {{ISO C++}} \
-                                expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-__arm_streaming using namespace ns; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-namespace __arm_streaming ns2 {} // expected-warning {{attributes on a namespace declaration are a C++17 extension}} \
-                                    expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-
-using __arm_streaming alignas(4)__arm_streaming ns::i;          // expected-warning 2 {{ISO C++}} \
-                                                                   expected-error {{'__arm_streaming' cannot appear here}} \
+extern "C++" ATTR_USE { } // expected-error {{'ATTR_NAME' cannot appear here}}
+ATTR_USE extern "C++" { } // expected-error {{'ATTR_NAME' cannot appear here}}
+ATTR_USE template <typename T> void before_template_attr (); // expected-error {{'ATTR_NAME' cannot appear here}}
+ATTR_USE namespace ns { int i; } // expected-error {{'ATTR_NAME' cannot appear here}}
+ATTR_USE static_assert(true, ""); //expected-error {{'ATTR_NAME' cannot appear here}}
+ATTR_USE asm(""); // expected-error {{'ATTR_NAME' cannot appear here}}
+
+ATTR_USE using ns::i; // expected-warning {{ISO C++}} \
+                                expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+ATTR_USE using namespace ns; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+namespace ATTR_USE ns2 {} // expected-warning {{attributes on a namespace declaration are a C++17 extension}} \
+                                    expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+
+using ATTR_USE alignas(4)ATTR_USE ns::i;          // expected-warning 2 {{ISO C++}} \
+                                                                   expected-error {{'ATTR_NAME' cannot appear here}} \
                                                                    expected-error {{'alignas' attribute only applies to variables, data members and tag types}} \
                                                                    expected-warning {{ISO C++}} \
-                                                                   expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
-using __arm_streaming alignas(4) __arm_streaming foobar = int; // expected-error {{'__arm_streaming' cannot appear here}} \
+                                                                   expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
+using ATTR_USE alignas(4) ATTR_USE foobar = int; // expected-error {{'ATTR_NAME' cannot appear here}} \
                                                                   expected-error {{'alignas' attribute only applies to}} \
-                                                                  expected-error 2 {{'__arm_streaming' only applies to function types}}
-
-__arm_streaming using T = int; // expected-error {{'__arm_streaming' cannot appear here}}
-using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}}
-template<typename T> using U __arm_streaming = T; // expected-error {{'__arm_streaming' only applies to function types}}
-using ns::i __arm_streaming; // expected-warning {{ISO C++}} \
-                                expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-using ns::i __arm_streaming, ns::i __arm_streaming; // expected-warning 2 {{ISO C++}} \
+                                                                  expected-error 2 {{'ATTR_NAME' only applies to function types}}
+
+ATTR_USE using T = int; // expected-error {{'ATTR_NAME' cannot appear here}}
+using T ATTR_USE = int; // expected-error {{'ATTR_NAME' only applies to function types}}
+template<typename T> using U ATTR_USE = T; // expected-error {{'ATTR_NAME' only applies to function types}}
+using ns::i ATTR_USE; // expected-warning {{ISO C++}} \
+                                expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+using ns::i ATTR_USE, ns::i ATTR_USE; // expected-warning 2 {{ISO C++}} \
                                                        expected-warning {{use of multiple declarators in a single using declaration is a C++17 extension}} \
-                                                       expected-error 2 {{'__arm_streaming' only applies to non-K&R-style functions}}
+                                                       expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}}
 struct using_in_struct_base {
   typedef int i, j, k, l;
 };
 struct using_in_struct : using_in_struct_base {
-  __arm_streaming using using_in_struct_base::i; // expected-warning {{ISO C++}} \
-                                                    expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-  using using_in_struct_base::j __arm_streaming; // expected-warning {{ISO C++}} \
-                                                    expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-  __arm_streaming using using_in_struct_base::k __arm_streaming, using_in_struct_base::l __arm_streaming; // expected-warning 3 {{ISO C++}} \
+  ATTR_USE using using_in_struct_base::i; // expected-warning {{ISO C++}} \
+                                                    expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  using using_in_struct_base::j ATTR_USE; // expected-warning {{ISO C++}} \
+                                                    expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  ATTR_USE using using_in_struct_base::k ATTR_USE, using_in_struct_base::l ATTR_USE; // expected-warning 3 {{ISO C++}} \
                                                                                                              expected-warning {{use of multiple declarators in a single using declaration is a C++17 extension}} \
-                                                                                                             expected-error 4 {{'__arm_streaming' only applies to non-K&R-style functions}}
+                                                                                                             expected-error 4 {{'ATTR_NAME' only applies to non-K&R-style functions}}
 };
-using __arm_streaming ns::i; // expected-warning {{ISO C++}} \
-                                expected-error {{'__arm_streaming' cannot appear here}} \
-                                expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}}
+using ATTR_USE ns::i; // expected-warning {{ISO C++}} \
+                                expected-error {{'ATTR_NAME' cannot appear here}} \
+                                expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+using T ATTR_USE = int; // expected-error {{'ATTR_NAME' only applies to function types}}
 
-auto trailing() -> __arm_streaming const int; // expected-error {{'__arm_streaming' cannot appear here}}
-auto trailing() -> const __arm_streaming int; // expected-error {{'__arm_streaming' cannot appear here}}
-auto trailing() -> const int __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
-auto trailing_2() -> struct struct_attr __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}}
+auto trailing() -> ATTR_USE const int; // expected-error {{'ATTR_NAME' cannot appear here}}
+auto trailing() -> const ATTR_USE int; // expected-error {{'ATTR_NAME' cannot appear here}}
+auto trailing() -> const int ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
+auto trailing_2() -> struct struct_attr ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}}
 
 namespace N {
   struct S {};
@@ -172,88 +175,88 @@ namespace N {
 template<typename> struct Template {};
 
 // FIXME: Improve this diagnostic
-struct __arm_streaming N::S s; // expected-error {{'__arm_streaming' cannot appear here}}
-struct __arm_streaming Template<int> t; // expected-error {{'__arm_streaming' cannot appear here}}
-struct __arm_streaming ::template Template<int> u; // expected-error {{'__arm_streaming' cannot appear here}}
-template struct __arm_streaming Template<char>; // expected-error {{'__arm_streaming' cannot appear here}}
+struct ATTR_USE N::S s; // expected-error {{'ATTR_NAME' cannot appear here}}
+struct ATTR_USE Template<int> t; // expected-error {{'ATTR_NAME' cannot appear here}}
+struct ATTR_USE ::template Template<int> u; // expected-error {{'ATTR_NAME' cannot appear here}}
+template struct ATTR_USE Template<char>; // expected-error {{'ATTR_NAME' cannot appear here}}
 template struct __attribute__((pure)) Template<std::size_t>; // We still allow GNU-style attributes here
-template <> struct __arm_streaming Template<void>; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-
-enum __arm_streaming E1 {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-enum __arm_streaming E2; // expected-error {{forbids forward references}} \
-                            expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-enum __arm_streaming E1; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-enum __arm_streaming E3 : int; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-enum __arm_streaming { // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-  k_123 __arm_streaming = 123 // expected-warning {{attributes on an enumerator declaration are a C++17 extension}} \
-                                 expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+template <> struct ATTR_USE Template<void>; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+
+enum ATTR_USE E1 {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+enum ATTR_USE E2; // expected-error {{forbids forward references}} \
+                            expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+enum ATTR_USE E1; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+enum ATTR_USE E3 : int; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+enum ATTR_USE { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+  k_123 ATTR_USE = 123 // expected-warning {{attributes on an enumerator declaration are a C++17 extension}} \
+                                 expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 };
-enum __arm_streaming E1 e; // expected-error {{'__arm_streaming' cannot appear here}}
-enum __arm_streaming class E4 { }; // expected-error {{'__arm_streaming' cannot appear here}}
-enum struct __arm_streaming E5; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
-enum E6 {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "enum" to apply it to the type declaration}}
+enum ATTR_USE E1 e; // expected-error {{'ATTR_NAME' cannot appear here}}
+enum ATTR_USE class E4 { }; // expected-error {{'ATTR_NAME' cannot appear here}}
+enum struct ATTR_USE E5; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
+enum E6 {} ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here, place it after "enum" to apply it to the type declaration}}
 
 struct S {
-  friend int f __arm_streaming (); // expected-error {{'__arm_streaming' cannot appear here}} \
-                                      expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-  friend int f2 __arm_streaming () {} // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-  __arm_streaming friend int g(); // expected-error {{'__arm_streaming' cannot appear here}}
-  __arm_streaming friend int h() { // expected-error {{'__arm_streaming' cannot be applied to a declaration}}
+  friend int f ATTR_USE (); // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                      expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+  friend int f2 ATTR_USE () {} // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+  ATTR_USE friend int g(); // expected-error {{'ATTR_NAME' cannot appear here}}
+  ATTR_USE friend int h() { // expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
   }
-  __arm_streaming friend int f3(), f4(), f5(); // expected-error {{'__arm_streaming' cannot appear here}}
-  friend int f6 __arm_streaming (), f7 __arm_streaming (), f8 __arm_streaming (); // expected-error3 {{'__arm_streaming' cannot appear here}} \
-                                                                                     expected-error 3 {{'__arm_streaming' cannot be applied to a declaration}}
-  friend class __arm_streaming C; // expected-error {{'__arm_streaming' cannot appear here}}
-  __arm_streaming friend class D; // expected-error {{'__arm_streaming' cannot appear here}}
-  __arm_streaming friend int; // expected-error {{'__arm_streaming' cannot appear here}}
+  ATTR_USE friend int f3(), f4(), f5(); // expected-error {{'ATTR_NAME' cannot appear here}}
+  friend int f6 ATTR_USE (), f7 ATTR_USE (), f8 ATTR_USE (); // expected-error3 {{'ATTR_NAME' cannot appear here}} \
+                                                                                     expected-error 3 {{'ATTR_NAME' cannot be applied to a declaration}}
+  friend class ATTR_USE C; // expected-error {{'ATTR_NAME' cannot appear here}}
+  ATTR_USE friend class D; // expected-error {{'ATTR_NAME' cannot appear here}}
+  ATTR_USE friend int; // expected-error {{'ATTR_NAME' cannot appear here}}
 };
 template<typename T> void tmpl (T) {}
-template __arm_streaming void tmpl(char); // expected-error {{'__arm_streaming' cannot appear here}}
-template void __arm_streaming tmpl(short); // expected-error {{'__arm_streaming' only applies to function types}}
+template ATTR_USE void tmpl(char); // expected-error {{'ATTR_NAME' cannot appear here}}
+template void ATTR_USE tmpl(short); // expected-error {{'ATTR_NAME' only applies to function types}}
 
 // Statement tests
 void foo () {
-  __arm_streaming ; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming { } // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming if (0) { } // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming for (;;); // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming do { // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-    __arm_streaming continue; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE ; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE { } // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE if (0) { } // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE for (;;); // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE do { // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+    ATTR_USE continue; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
   } while (0);
-  __arm_streaming while (0); // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE while (0); // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
 
-  __arm_streaming switch (i) { // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-    __arm_streaming case 0: // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-    __arm_streaming default: // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-      __arm_streaming break; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE switch (i) { // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+    ATTR_USE case 0: // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+    ATTR_USE default: // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+      ATTR_USE break; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
   }
 
-  __arm_streaming goto there; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  __arm_streaming there: // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+  ATTR_USE goto there; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  ATTR_USE there: // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
-  __arm_streaming try { // expected-error {{'__arm_streaming' cannot be applied to a statement}}
-  } __arm_streaming catch (...) { // expected-error {{'__arm_streaming' cannot appear here}}
+  ATTR_USE try { // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
+  } ATTR_USE catch (...) { // expected-error {{'ATTR_NAME' cannot appear here}}
   }
 
-  void bar __arm_streaming (__arm_streaming int i, __arm_streaming int j); // expected-error 2 {{'__arm_streaming' only applies to function types}} \
-                                                                              expected-error {{'__arm_streaming' cannot be applied to a declaration}}
-  using FuncType = void (__arm_streaming int); // expected-error {{'__arm_streaming' only applies to function types}}
-  void baz(__arm_streaming...); // expected-error {{expected parameter declarator}}
+  void bar ATTR_USE (ATTR_USE int i, ATTR_USE int j); // expected-error 2 {{'ATTR_NAME' only applies to function types}} \
+                                                                              expected-error {{'ATTR_NAME' cannot be applied to a declaration}}
+  using FuncType = void (ATTR_USE int); // expected-error {{'ATTR_NAME' only applies to function types}}
+  void baz(ATTR_USE...); // expected-error {{expected parameter declarator}}
 
-  __arm_streaming return; // expected-error {{'__arm_streaming' cannot be applied to a statement}}
+  ATTR_USE return; // expected-error {{'ATTR_NAME' cannot be applied to a statement}}
 }
 
 // Expression tests
 void bar () {
-  new int[42]__arm_streaming[5]__arm_streaming{}; // expected-error {{'__arm_streaming' only applies to function types}}
+  new int[42]ATTR_USE[5]ATTR_USE{}; // expected-error {{'ATTR_NAME' only applies to function types}}
 }
 
 // Condition tests
 void baz () {
-  if (__arm_streaming bool b = true) { // expected-error {{'__arm_streaming' only applies to function types}}
-    switch (__arm_streaming int n { 42 }) { // expected-error {{'__arm_streaming' only applies to function types}}
+  if (ATTR_USE bool b = true) { // expected-error {{'ATTR_NAME' only applies to function types}}
+    switch (ATTR_USE int n { 42 }) { // expected-error {{'ATTR_NAME' only applies to function types}}
     default:
-      for (__arm_streaming int n = 0; __arm_streaming char b = n < 5; ++b) { // expected-error 2 {{'__arm_streaming' only applies to function types}}
+      for (ATTR_USE int n = 0; ATTR_USE char b = n < 5; ++b) { // expected-error 2 {{'ATTR_NAME' only applies to function types}}
       }
     }
   }
@@ -261,37 +264,37 @@ void baz () {
   // An attribute can be applied to an expression-statement, such as the first
   // statement in a for. But it can't be applied to a condition which is an
   // expression.
-  for (__arm_streaming x = 0; ; ) {} // expected-error {{'__arm_streaming' cannot appear here}}
-  for (; __arm_streaming x < 5; ) {} // expected-error {{'__arm_streaming' cannot appear here}}
-  while (__arm_streaming bool k { false }) { // expected-error {{'__arm_streaming' only applies to function types}}
+  for (ATTR_USE x = 0; ; ) {} // expected-error {{'ATTR_NAME' cannot appear here}}
+  for (; ATTR_USE x < 5; ) {} // expected-error {{'ATTR_NAME' cannot appear here}}
+  while (ATTR_USE bool k { false }) { // expected-error {{'ATTR_NAME' only applies to function types}}
   }
-  while (__arm_streaming true) { // expected-error {{'__arm_streaming' cannot appear here}}
+  while (ATTR_USE true) { // expected-error {{'ATTR_NAME' cannot appear here}}
   }
   do {
-  } while (__arm_streaming false); // expected-error {{'__arm_streaming' cannot appear here}}
+  } while (ATTR_USE false); // expected-error {{'ATTR_NAME' cannot appear here}}
 
-  for (__arm_streaming int n : { 1, 2, 3 }) { // expected-error {{'__arm_streaming' only applies to function types}}
+  for (ATTR_USE int n : { 1, 2, 3 }) { // expected-error {{'ATTR_NAME' only applies to function types}}
   }
 }
 
 enum class __attribute__((visibility("hidden"))) SecretKeepers {
   one, /* rest are deprecated */ two, three
 };
-enum class __arm_streaming EvenMoreSecrets {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+enum class ATTR_USE EvenMoreSecrets {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
 // Forbid attributes on decl specifiers.
-unsigned __arm_streaming static int __arm_streaming v1; // expected-error {{'__arm_streaming' only applies to function types}} \
-           expected-error {{'__arm_streaming' cannot appear here}}
-typedef __arm_streaming unsigned long __arm_streaming v2; // expected-error {{'__arm_streaming' only applies to function types}} \
-          expected-error {{'__arm_streaming' cannot appear here}}
-int __arm_streaming foo(int __arm_streaming x); // expected-error 2 {{'__arm_streaming' only applies to function types}}
+unsigned ATTR_USE static int ATTR_USE v1; // expected-error {{'ATTR_NAME' only applies to function types}} \
+           expected-error {{'ATTR_NAME' cannot appear here}}
+typedef ATTR_USE unsigned long ATTR_USE v2; // expected-error {{'ATTR_NAME' only applies to function types}} \
+          expected-error {{'ATTR_NAME' cannot appear here}}
+int ATTR_USE foo(int ATTR_USE x); // expected-error 2 {{'ATTR_NAME' only applies to function types}}
 
-__arm_streaming; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}}
+ATTR_USE; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}}
 
 class A {
-  A(__arm_streaming int a); // expected-error {{'__arm_streaming' only applies to function types}}
+  A(ATTR_USE int a); // expected-error {{'ATTR_NAME' only applies to function types}}
 };
-A::A(__arm_streaming int a) {} // expected-error {{'__arm_streaming' only applies to function types}}
+A::A(ATTR_USE int a) {} // expected-error {{'ATTR_NAME' only applies to function types}}
 
 template<typename T> struct TemplateStruct {};
 class FriendClassesWithAttributes {
@@ -299,47 +302,47 @@ class FriendClassesWithAttributes {
   template <class _Tp, class _Alloc> friend class __attribute__((__type_visibility__("default"))) vector;
   template <class _Tp, class _Alloc> friend class __declspec(code_seg("foo,whatever")) vector2;
   // But not C++11 ones
-  template <class _Tp, class _Alloc> friend class __arm_streaming vector3;                                         // expected-error {{'__arm_streaming' cannot appear here}}
+  template <class _Tp, class _Alloc> friend class ATTR_USE vector3;                                         // expected-error {{'ATTR_NAME' cannot appear here}}
 
   // Also allowed
   friend struct __attribute__((__type_visibility__("default"))) TemplateStruct<FriendClassesWithAttributes>;
   friend struct __declspec(code_seg("foo,whatever")) TemplateStruct<FriendClassesWithAttributes>;
-  friend struct __arm_streaming TemplateStruct<FriendClassesWithAttributes>;                                       // expected-error {{'__arm_streaming' cannot appear here}}
+  friend struct ATTR_USE TemplateStruct<FriendClassesWithAttributes>;                                       // expected-error {{'ATTR_NAME' cannot appear here}}
 };
 
 // Check ordering: C++11 attributes must appear before GNU attributes.
 class Ordering {
   void f1(
-    int (__arm_streaming __attribute__(()) int n) // expected-error {{'__arm_streaming' only applies to function types}}
+    int (ATTR_USE __attribute__(()) int n) // expected-error {{'ATTR_NAME' only applies to function types}}
   ) {
   }
 
   void f2(
-      int (*)(__arm_streaming __attribute__(()) int n) // expected-error {{'__arm_streaming' only applies to function types}}
+      int (*)(ATTR_USE __attribute__(()) int n) // expected-error {{'ATTR_NAME' only applies to function types}}
   ) {
   }
 
   void f3(
-    int (__attribute__(()) __arm_streaming int n) // expected-error {{'__arm_streaming' cannot appear here}}
+    int (__attribute__(()) ATTR_USE int n) // expected-error {{'ATTR_NAME' cannot appear here}}
   ) {
   }
 
   void f4(
-      int (*)(__attribute__(()) __arm_streaming int n) // expected-error {{'__arm_streaming' cannot appear here}}
+      int (*)(__attribute__(()) ATTR_USE int n) // expected-error {{'ATTR_NAME' cannot appear here}}
   ) {
   }
 };
 
 namespace base_specs {
 struct A {};
-struct B : __arm_streaming A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}}
-struct C : __arm_streaming virtual A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}}
-struct D : __arm_streaming public virtual A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}}
-struct E : public __arm_streaming virtual A {}; // expected-error {{'__arm_streaming' cannot appear here}} \
-                                                   expected-error {{'__arm_streaming' cannot be applied to a base specifier}}
-struct F : virtual __arm_streaming public A {}; // expected-error {{'__arm_streaming' cannot appear here}} \
-                                                   expected-error {{'__arm_streaming' cannot be applied to a base specifier}}
+struct B : ATTR_USE A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}}
+struct C : ATTR_USE virtual A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}}
+struct D : ATTR_USE public virtual A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}}
+struct E : public ATTR_USE virtual A {}; // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                                   expected-error {{'ATTR_NAME' cannot be applied to a base specifier}}
+struct F : virtual ATTR_USE public A {}; // expected-error {{'ATTR_NAME' cannot appear here}} \
+                                                   expected-error {{'ATTR_NAME' cannot be applied to a base specifier}}
 }
 
-namespace __arm_streaming ns_attr {}; // expected-error {{'__arm_streaming' only applies to non-K&R-style functions}} \
+namespace ATTR_USE ns_attr {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} \
                                          expected-warning {{attributes on a namespace declaration are a C++17 extension}}
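
Aside (illustrative sketch, not part of the diff; assumes +sme as in the RUN lines): the placement rule this file checks is that the keyword appertains to the function type, so it is accepted after the declarator but rejected in type-specifier position, e.g.:

  void fn_attr(void) __arm_streaming;                     // OK: applies to the function type
  void fn_state(void) __arm_streaming __arm_inout("za");  // OK: several keywords may follow the declarator
  // whereas a spelling like '__arm_streaming int v;' is diagnosed, since 'int' is not a function type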

diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
index e63d9f0a8475763..476da8534ce76ff 100644
--- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
+++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
@@ -23,7 +23,7 @@ int16x8_t incompat_neon_smc(int16x8_t splat) __arm_streaming_compatible {
   return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33);
 }
 
-void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible __arm_shared_za {
+void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible __arm_inout("za") {
   // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}}
   return __builtin_sme_svld1_hor_za128(0, 0, pg, ptr);
 }
@@ -58,7 +58,7 @@ svuint32_t incompat_sve2_smc(svbool_t pg, svuint32_t a, int64_t b) __arm_streami
   return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b);
 }
 
-void incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_shared_za {
+void incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_inout("za") {
   // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}}
   svmops_za32_f32_m(0, pn, pm, zn, zm);
 }
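
Aside (illustrative sketch, not part of the diff; the function name is made up and this test's headers and RUN lines are assumed): the matching case does not warn, i.e. an SME builtin called from a function that is itself streaming and shares ZA:

  void compat_sme_sm(svbool_t pg, void const *ptr) __arm_streaming __arm_inout("za") {
    // no warning expected: streaming caller with ZA state
    __builtin_sme_svld1_hor_za128(0, 0, pg, ptr);
  }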

diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
index b59d67f7f57b886..0a54a94f408b795 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
+++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
@@ -4,16 +4,16 @@
 
 void streaming_compatible_def() __arm_streaming_compatible {} // OK
 void streaming_def() __arm_streaming { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}}
-void shared_za_def() __arm_shared_za { } // expected-error {{function using ZA state requires 'sme'}}
-__arm_new_za void new_za_def() { } // expected-error {{function using ZA state requires 'sme'}}
+void shared_za_def() __arm_inout("za") { } // expected-error {{function using ZA state requires 'sme'}}
+__arm_new("za") void new_za_def() { } // expected-error {{function using ZA state requires 'sme'}}
 __arm_locally_streaming void locally_streaming_def() { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}}
-void streaming_shared_za_def() __arm_streaming __arm_shared_za { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}}
+void streaming_shared_za_def() __arm_streaming __arm_inout("za") { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}}
 
 // It should work fine when we explicitly add the target("sme") attribute.
 __attribute__((target("sme"))) void streaming_compatible_def_sme_attr() __arm_streaming_compatible {} // OK
 __attribute__((target("sme"))) void streaming_def_sme_attr() __arm_streaming { } // OK
-__attribute__((target("sme"))) void shared_za_def_sme_attr() __arm_shared_za { } // OK
-__arm_new_za __attribute__((target("sme"))) void new_za_def_sme_attr() {} // OK
+__attribute__((target("sme"))) void shared_za_def_sme_attr() __arm_inout("za") { } // OK
+__arm_new("za") __attribute__((target("sme"))) void new_za_def_sme_attr() {} // OK
 __arm_locally_streaming __attribute__((target("sme"))) void locally_streaming_def_sme_attr() {} // OK
 
 // Test that it also works with the target("sme2") attribute.
@@ -22,7 +22,7 @@ __attribute__((target("sme2"))) void streaming_def_sme2_attr() __arm_streaming {
 // No code is generated for declarations, so it should be fine to declare using the attribute.
 void streaming_compatible_decl() __arm_streaming_compatible; // OK
 void streaming_decl() __arm_streaming; // OK
-void shared_za_decl() __arm_shared_za; // OK
+void shared_za_decl() __arm_inout("za"); // OK
 
 void non_streaming_decl();
 void non_streaming_def(void (*streaming_fn_ptr)(void) __arm_streaming,

diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c
index 73c0934d689e7c9..b986b0b3de2e14d 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs.c
+++ b/clang/test/Sema/aarch64-sme-func-attrs.c
@@ -6,27 +6,25 @@
 void sme_arm_streaming(void) __arm_streaming;
 void sme_arm_streaming_compatible(void) __arm_streaming_compatible;
 
-__arm_new_za void sme_arm_new_za(void) {}
-void sme_arm_shared_za(void) __arm_shared_za;
-void sme_arm_preserves_za(void) __arm_preserves_za;
+__arm_new("za") void sme_arm_new_za(void) {}
+void sme_arm_shared_za(void) __arm_inout("za");
+void sme_arm_preserves_za(void) __arm_preserves("za");
 
-__arm_new_za void sme_arm_streaming_new_za(void) __arm_streaming {}
-void sme_arm_streaming_shared_za(void) __arm_streaming __arm_shared_za;
-void sme_arm_streaming_preserves_za(void) __arm_streaming __arm_preserves_za;
+__arm_new("za") void sme_arm_streaming_new_za(void) __arm_streaming {}
+void sme_arm_streaming_shared_za(void) __arm_streaming __arm_inout("za");
+void sme_arm_streaming_preserves_za(void) __arm_streaming __arm_preserves("za");
 
-__arm_new_za void sme_arm_sc_new_za(void) __arm_streaming_compatible {}
-void sme_arm_sc_shared_za(void) __arm_streaming_compatible __arm_shared_za;
-void sme_arm_sc_preserves_za(void) __arm_streaming_compatible __arm_preserves_za;
-
-void sme_arm_shared_preserves_za(void) __arm_shared_za __arm_preserves_za;
+__arm_new("za") void sme_arm_sc_new_za(void) __arm_streaming_compatible {}
+void sme_arm_sc_shared_za(void) __arm_streaming_compatible __arm_inout("za");
+void sme_arm_sc_preserves_za(void) __arm_streaming_compatible __arm_preserves("za");
 
 __arm_locally_streaming void sme_arm_locally_streaming(void) { }
 __arm_locally_streaming void sme_arm_streaming_and_locally_streaming(void) __arm_streaming { }
 __arm_locally_streaming void sme_arm_streaming_and_streaming_compatible(void) __arm_streaming_compatible { }
 
-__arm_locally_streaming __arm_new_za void sme_arm_ls_new_za(void) { }
-__arm_locally_streaming void sme_arm_ls_shared_za(void) __arm_shared_za { }
-__arm_locally_streaming void sme_arm_ls_preserves_za(void) __arm_preserves_za { }
+__arm_locally_streaming __arm_new("za") void sme_arm_ls_new_za(void) { }
+__arm_locally_streaming void sme_arm_ls_shared_za(void) __arm_inout("za") { }
+__arm_locally_streaming void sme_arm_ls_preserves_za(void) __arm_preserves("za") { }
 
 // Valid attributes on function pointers
 
@@ -38,18 +36,14 @@ void streaming_compatible_ptr(void) __arm_streaming_compatible;
 typedef void (*fptrty2) (void) __arm_streaming_compatible;
 fptrty2 call_sc_func() { return streaming_compatible_ptr; }
 
-void shared_za_ptr(void) __arm_shared_za;
-typedef void (*fptrty3) (void) __arm_shared_za;
+void shared_za_ptr(void) __arm_inout("za");
+typedef void (*fptrty3) (void) __arm_inout("za");
 fptrty3 call_shared_za_func() { return shared_za_ptr; }
 
-void preserves_za_ptr(void) __arm_preserves_za;
-typedef void (*fptrty4) (void) __arm_preserves_za;
+void preserves_za_ptr(void) __arm_preserves("za");
+typedef void (*fptrty4) (void) __arm_preserves("za");
 fptrty4 call_preserve_za_func() { return preserves_za_ptr; }
 
-void shared_preserves_za_ptr(void) __arm_shared_za __arm_preserves_za;
-typedef void (*fptrty5) (void) __arm_shared_za __arm_preserves_za;
-fptrty5 call_shared_preserve_za_func() { return shared_preserves_za_ptr; }
-
 typedef void (*fptrty6) (void);
 fptrty6 cast_nza_func_to_normal() { return sme_arm_new_za; }
 fptrty6 cast_ls_func_to_normal() { return sme_arm_locally_streaming; }
@@ -68,13 +62,13 @@ void streaming_mode(void) __arm_streaming __arm_streaming_compatible;
 // expected-note at +1 {{conflicting attribute is here}}
 void streaming_compatible(void) __arm_streaming_compatible __arm_streaming;
 
-// expected-cpp-error at +2 {{'__arm_new_za' and '__arm_shared_za' are not compatible}}
-// expected-error at +1 {{'__arm_new_za' and '__arm_shared_za' are not compatible}}
-__arm_new_za void new_shared_za(void) __arm_shared_za {}
+// expected-cpp-error at +2 {{'__arm_new("za")' and '__arm_inout("za")' are not compatible}}
+// expected-error at +1 {{'__arm_new("za")' and '__arm_inout("za")' are not compatible}}
+__arm_new("za") void new_shared_za(void) __arm_inout("za") {}
 
-// expected-cpp-error at +2 {{'__arm_new_za' and '__arm_preserves_za' are not compatible}}
-// expected-error at +1 {{'__arm_new_za' and '__arm_preserves_za' are not compatible}}
-__arm_new_za void new_preserves_za(void) __arm_preserves_za {}
+// expected-cpp-error at +2 {{'__arm_new("za")' and '__arm_preserves("za")' are not compatible}}
+// expected-error at +1 {{'__arm_new("za")' and '__arm_preserves("za")' are not compatible}}
+__arm_new("za") void new_preserves_za(void) __arm_preserves("za") {}
 
 // Invalid attributes on function pointers
 
@@ -125,24 +119,25 @@ sc_ptrty return_invalid_fptr_streaming_compatible_normal(n_ptrty f) { return f;
 // expected-error at +1 {{incompatible function pointer types returning 'sc_ptrty' (aka 'void (*)(void) __arm_streaming_compatible') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}}
 n_ptrty return_invalid_fptr_normal_streaming_compatible(sc_ptrty f) { return f; }
 
-typedef void (*sz_ptrty) (void) __arm_shared_za;
+typedef void (*sz_ptrty) (void) __arm_inout("za");
 sz_ptrty return_valid_shared_za_fptr(sz_ptrty f) { return f; }
 
 
-// expected-cpp-error at +2 {{cannot initialize return object of type 'sz_ptrty' (aka 'void (*)() __arm_shared_za') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}}
-// expected-error at +1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sz_ptrty' (aka 'void (*)(void) __arm_shared_za')}}
+// expected-cpp-error at +2 {{cannot initialize return object of type 'sz_ptrty' (aka 'void (*)() __arm_inout("za")') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}}
+// expected-error at +1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sz_ptrty' (aka 'void (*)(void) __arm_inout("za")')}}
 sz_ptrty return_invalid_fptr_shared_za_normal(n_ptrty f) { return f; }
-// expected-cpp-error at +2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sz_ptrty' (aka 'void (*)() __arm_shared_za')}}
-// expected-error at +1 {{incompatible function pointer types returning 'sz_ptrty' (aka 'void (*)(void) __arm_shared_za') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}}
+// expected-cpp-error at +2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sz_ptrty' (aka 'void (*)() __arm_inout("za")')}}
+// expected-error at +1 {{incompatible function pointer types returning 'sz_ptrty' (aka 'void (*)(void) __arm_inout("za")') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}}
 n_ptrty return_invalid_fptr_normal_shared_za(sz_ptrty f) { return f; }
 
-typedef void (*pz_ptrty) (void) __arm_preserves_za;
+typedef void (*pz_ptrty) (void) __arm_preserves("za");
 pz_ptrty return_valid_preserves_za_fptr(pz_ptrty f) { return f; }
 
-// expected-cpp-error at +2 {{cannot initialize return object of type 'pz_ptrty' (aka 'void (*)() __arm_preserves_za') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}}
-// expected-error at +1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'pz_ptrty' (aka 'void (*)(void) __arm_preserves_za')}}
+// expected-cpp-error at +2 {{cannot initialize return object of type 'pz_ptrty' (aka 'void (*)() __arm_preserves("za")') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}}
+// expected-error at +1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'pz_ptrty' (aka 'void (*)(void) __arm_preserves("za")')}}
 pz_ptrty return_invalid_fptr_preserves_za_normal(n_ptrty f) { return f; }
-// No diagnostics, the preserves_za hint should be dropped silently.
+// expected-cpp-error at +2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'pz_ptrty' (aka 'void (*)() __arm_preserves("za")')}}
+// expected-error at +1 {{incompatible function pointer types returning 'pz_ptrty' (aka 'void (*)(void) __arm_preserves("za")') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}}
 n_ptrty return_invalid_fptr_normal_preserves_za(pz_ptrty f) { return f; }
 
 // Test template instantiations
@@ -164,21 +159,21 @@ template short templated<short>(short);
 void redecl(void) __arm_streaming;
 void redecl(void) __arm_streaming_compatible { }
 
-// expected-error at +5 {{function declared 'void (void) __arm_shared_za' was previously declared 'void (void) __arm_shared_za __arm_preserves_za', which has different SME function attributes}}
+// expected-error at +5 {{function declared 'void (void)' was previously declared 'void (void) __arm_preserves("za")', which has different SME function attributes}}
 // expected-note at +3 {{previous declaration is here}}
-// expected-cpp-error at +3 {{function declared 'void () __arm_shared_za' was previously declared 'void () __arm_shared_za __arm_preserves_za', which has different SME function attributes}}
+// expected-cpp-error at +3 {{function declared 'void ()' was previously declared 'void () __arm_preserves("za")', which has different SME function attributes}}
 // expected-cpp-note at +1 {{previous declaration is here}}
-void redecl_preserve_za(void) __arm_shared_za __arm_preserves_za;;
-void redecl_preserve_za(void) __arm_shared_za {}
+void redecl_preserve_za(void) __arm_preserves("za");;
+void redecl_preserve_za(void) {}
 
-// expected-error at +5 {{function declared 'void (void) __arm_shared_za __arm_preserves_za' was previously declared 'void (void) __arm_shared_za', which has different SME function attributes}}
+// expected-error at +5 {{function declared 'void (void) __arm_preserves("za")' was previously declared 'void (void)', which has different SME function attributes}}
 // expected-note at +3 {{previous declaration is here}}
-// expected-cpp-error at +3 {{function declared 'void () __arm_shared_za __arm_preserves_za' was previously declared 'void () __arm_shared_za', which has different SME function attributes}}
+// expected-cpp-error at +3 {{function declared 'void () __arm_preserves("za")' was previously declared 'void ()', which has different SME function attributes}}
 // expected-cpp-note at +1 {{previous declaration is here}}
-void redecl_nopreserve_za(void) __arm_shared_za;
-void redecl_nopreserve_za(void) __arm_shared_za __arm_preserves_za {}
+void redecl_nopreserve_za(void);
+void redecl_nopreserve_za(void) __arm_preserves("za") {}
 
-void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) {
+void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za"), void (*preserves_za_fn_ptr)(void) __arm_preserves("za")) {
   sme_arm_new_za(); // OK
   // expected-error at +2 {{call to a shared ZA function requires the caller to have ZA state}}
   // expected-cpp-error at +1 {{call to a shared ZA function requires the caller to have ZA state}}
@@ -186,43 +181,46 @@ void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) {
   // expected-error at +2 {{call to a shared ZA function requires the caller to have ZA state}}
   // expected-cpp-error at +1 {{call to a shared ZA function requires the caller to have ZA state}}
   shared_za_fn_ptr();
+  // expected-error at +2 {{call to a shared ZA function requires the caller to have ZA state}}
+  // expected-cpp-error at +1 {{call to a shared ZA function requires the caller to have ZA state}}
+  preserves_za_fn_ptr();
 }
 
-void shared_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) __arm_shared_za {
+void shared_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za")) __arm_inout("za") {
   sme_arm_shared_za(); // OK
   shared_za_fn_ptr(); // OK
 }
 
-__arm_new_za void new_za_definition(void (*shared_za_fn_ptr)(void) __arm_shared_za) {
+__arm_new("za") void new_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za")) {
   sme_arm_shared_za(); // OK
   shared_za_fn_ptr(); // OK
 }
 
 #ifdef __cplusplus
-int shared_za_initializer(void) __arm_shared_za;
+int shared_za_initializer(void) __arm_inout("za");
 // expected-cpp-error at +1 {{call to a shared ZA function requires the caller to have ZA state}}
 int global = shared_za_initializer();
 
 struct S {
-  virtual void shared_za_memberfn(void) __arm_shared_za;
+  virtual void shared_za_memberfn(void) __arm_inout("za");
 };
 
 struct S2 : public S {
-// expected-cpp-error at +2 {{virtual function 'shared_za_memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_shared_za')}}
+// expected-cpp-error at +2 {{virtual function 'shared_za_memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_inout("za")')}}
 // expected-cpp-note at -5 {{overridden virtual function is here}}
-  __arm_new_za void shared_za_memberfn(void) override {}
+  __arm_new("za") void shared_za_memberfn(void) override {}
 };
 
-// The '__arm_preserves_za' property cannot be dropped when overriding a virtual
-// function. It is however fine for the overriding function to be '__arm_preserves_za'
+// The '__arm_preserves("za")' property cannot be dropped when overriding a virtual
+// function. It is however fine for the overriding function to be '__arm_preserves("za")'
 // even though the function that it overrides is not.
 
 struct S_PreservesZA {
-  virtual void memberfn(void) __arm_preserves_za;
+  virtual void memberfn(void) __arm_preserves("za");
 };
 
 struct S_Drop_PreservesZA : S_PreservesZA {
-// expected-cpp-error at +2 {{virtual function 'memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_preserves_za')}}
+// expected-cpp-error at +2 {{virtual function 'memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_preserves("za")')}}
 // expected-cpp-note at -5 {{overridden virtual function is here}}
   void memberfn(void) override {}
 };
@@ -230,9 +228,11 @@ struct S_Drop_PreservesZA : S_PreservesZA {
 struct S_NoPreservesZA {
   virtual void memberfn(void);
 };
+
 struct S_AddPreservesZA : S_NoPreservesZA {
-// This is fine, the overridden function just adds more guarantees.
-  void memberfn(void) __arm_preserves_za override {}
+// expected-cpp-error at +2 {{virtual function 'memberfn' has different attributes ('void () __arm_preserves("za")') than the function it overrides (which has 'void ()')}}
+// expected-cpp-note at -5 {{overridden virtual function is here}}
+  void memberfn(void) __arm_preserves("za") override {}
 };
 
 
@@ -258,20 +258,20 @@ struct S3<void (* __arm_streaming_compatible)()> {
 };
 
 template <>
-struct S3<void (* __arm_shared_za)()> {
+struct S3<void (* __arm_inout("za"))()> {
   static constexpr int value = 8;
 };
 
 template <>
-struct S3<void (* __arm_preserves_za)()> {
+struct S3<void (* __arm_preserves("za"))()> {
   static constexpr int value = 16;
 };
 
 void normal_func(void) {}
 void streaming_func(void) __arm_streaming {}
 void streaming_compatible_func(void) __arm_streaming_compatible {}
-void shared_za_func(void) __arm_shared_za {}
-void preserves_za_func(void) __arm_preserves_za {}
+void shared_za_func(void) __arm_inout("za") {}
+void preserves_za_func(void) __arm_preserves("za") {}
 
 static_assert(S3<decltype(+normal_func)>::value == 1, "why are we picking the wrong specialization?");
 static_assert(S3<decltype(+streaming_func)>::value == 2, "why are we picking the wrong specialization?");
@@ -295,8 +295,8 @@ template<typename T> int test_templated_f(T);
 template<> constexpr int test_templated_f<void(*)(void)>(void(*)(void)) { return 1; }
 template<> constexpr int test_templated_f<void(*)(void)__arm_streaming>(void(*)(void)__arm_streaming) { return 2; }
 template<> constexpr int test_templated_f<void(*)(void)__arm_streaming_compatible>(void(*)(void)__arm_streaming_compatible) { return 4; }
-template<> constexpr int test_templated_f<void(*)(void)__arm_shared_za>(void(*)(void)__arm_shared_za) { return 8; }
-template<> constexpr int test_templated_f<void(*)(void)__arm_preserves_za>(void(*)(void)__arm_preserves_za) { return 16; }
+template<> constexpr int test_templated_f<void(*)(void)__arm_inout("za")>(void(*)(void)__arm_inout("za")) { return 8; }
+template<> constexpr int test_templated_f<void(*)(void)__arm_preserves("za")>(void(*)(void)__arm_preserves("za")) { return 16; }
 
 static_assert(test_templated_f(&normal_func) == 1, "Instantiated to wrong function");
 static_assert(test_templated_f(&streaming_func) == 2, "Instantiated to wrong function");
@@ -312,8 +312,8 @@ int invalid_type_for_attribute __arm_streaming;
 constexpr int overload(void f(void)) { return 1; }
 constexpr int overload(void f(void) __arm_streaming) { return 2; }
 constexpr int overload(void f(void) __arm_streaming_compatible) { return 4; }
-constexpr int overload(void f(void) __arm_shared_za) { return 8; }
-constexpr int overload(void f(void) __arm_preserves_za) { return 16; }
+constexpr int overload(void f(void) __arm_inout("za")) { return 8; }
+constexpr int overload(void f(void) __arm_preserves("za")) { return 16; }
 static_assert(overload(&normal_func) == 1, "Overloaded to wrong function");
 static_assert(overload(&streaming_func) == 2, "Overloaded to wrong function");
 static_assert(overload(&streaming_compatible_func) == 4, "Overloaded to wrong function");
@@ -330,3 +330,73 @@ constexpr X<int> *ptr = 0;
 static_assert(overload_int(ptr->foo) == 2, "Overloaded to the wrong function after implicit instantiation");
 
 #endif // ifdef __cplusplus
+
+// expected-cpp-error at +2 {{unknown state ''}}
+// expected-error at +1 {{unknown state ''}}
+__arm_new("") void invalid_arm_new_empty_string(void);
+// expected-cpp-error at +2 {{expected string literal as argument of '__arm_new' attribute}}
+// expected-error at +1 {{expected string literal as argument of '__arm_new' attribute}}
+__arm_new(0) void invalid_arm_new_non_literal_string(void);
+// expected-cpp-error at +2 {{unknown state 'unknownstate'}}
+// expected-error at +1 {{unknown state 'unknownstate'}}
+__arm_new("unknownstate") void invalid_arm_new_unknown_state(void);
+
+// expected-cpp-error at +2 {{unknown state ''}}
+// expected-error at +1 {{unknown state ''}}
+void invalid_arm_in_empty_string(void) __arm_in("");
+// expected-cpp-error at +2 {{expected string literal as argument of '__arm_in' attribute}}
+// expected-error at +1 {{expected string literal as argument of '__arm_in' attribute}}
+void invalid_arm_in_non_literal_string(void) __arm_in(0);
+// expected-cpp-error at +2 {{unknown state 'unknownstate'}}
+// expected-error at +1 {{unknown state 'unknownstate'}}
+void invalid_arm_in_unknown_state(void) __arm_in("unknownstate");
+
+void valid_state_attrs_in_in1(void) __arm_in("za");
+void valid_state_attrs_in_in2(void) __arm_in("za", "za");
+
+// expected-cpp-error at +2 {{missing state for '__arm_in'}}
+// expected-error at +1 {{missing state for '__arm_in'}}
+void invalid_state_attrs_no_arg1(void) __arm_in();
+// expected-cpp-error at +2 {{missing state for '__arm_new'}}
+// expected-error at +1 {{missing state for '__arm_new'}}
+__arm_new() void invalid_state_attrs_no_arg2(void);
+
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_in_out(void) __arm_in("za") __arm_out("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_in_inout(void) __arm_in("za") __arm_inout("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_in_preserves(void) __arm_in("za") __arm_preserves("za");
+
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_out_in(void) __arm_out("za") __arm_in("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_out_inout(void) __arm_out("za") __arm_inout("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_out_preserves(void) __arm_out("za") __arm_preserves("za");
+
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_inout_in(void) __arm_inout("za") __arm_in("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_inout_out(void) __arm_inout("za") __arm_out("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_inout_preserves(void) __arm_inout("za") __arm_preserves("za");
+
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_preserves_in(void) __arm_preserves("za") __arm_in("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_preserves_out(void) __arm_preserves("za") __arm_out("za");
+// expected-cpp-error at +2 {{conflicting attributes for state 'za'}}
+// expected-error at +1 {{conflicting attributes for state 'za'}}
+void conflicting_state_attrs_preserves_inout(void) __arm_preserves("za") __arm_inout("za");

diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
index 529d0d2d1e625e8..40254a5a0eafa37 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
@@ -12,7 +12,7 @@
 
 #include <arm_sme_draft_spec_subject_to_change.h>
 
-void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}}
   SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, slice, pg, ptr);
   // expected-error at +1 {{argument value 1 is outside the valid range [0, 0]}}
@@ -32,7 +32,7 @@ void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar
   SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, slice, pg, svundef_s8());
 }
 
-void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
   SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, slice, pg, ptr);
   // expected-error at +1 {{argument value 2 is outside the valid range [0, 1]}}
@@ -52,7 +52,7 @@ void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar
   SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, slice, pg, svundef_s16());
 }
 
-void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
   SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, slice, pg, ptr);
   // expected-error at +1 {{argument value 4 is outside the valid range [0, 3]}}
@@ -90,7 +90,7 @@ void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar
   SVE_ACLE_FUNC(svusmops_za32, _u8, _m,)(-1, pg, pg, svundef_u8(), svundef_s8());
 }
 
-void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
   SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, slice, pg, ptr);
   // expected-error at +1 {{argument value 8 is outside the valid range [0, 7]}}
@@ -133,7 +133,7 @@ void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __ar
   SVE_ACLE_FUNC(svmops_za64, _f64, _m,)(-1, pg, pg, svundef_f64(), svundef_f64());
 }
 
-void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
   SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, slice, pg, ptr);
   // expected-error at +1 {{argument value 16 is outside the valid range [0, 15]}}
@@ -153,14 +153,14 @@ void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __a
   SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, slice, pg, svundef_s8());
 }
 
-void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   // expected-error at +1 {{argument value 256 is outside the valid range [0, 255]}}
   SVE_ACLE_FUNC(svzero_mask_za,,,)(256);
   // expected-error at +1 {{argument value 18446744073709551615 is outside the valid range [0, 255]}}
   SVE_ACLE_FUNC(svzero_mask_za,,,)(-1);
 }
 
-void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, pg, ptr);  // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}}
   SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}}
   SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64);  // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}}

diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
index 95bb6be2d2d3441..f1e858f819602ed 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
@@ -6,21 +6,21 @@
 #include <arm_sme_draft_spec_subject_to_change.h>
 
 __attribute__((target("sme")))
-void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   svld1_hor_za8(0, 0, pg, ptr);
 }
 
 __attribute__((target("arch=armv8-a+sme")))
-void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   svld1_hor_vnum_za32(0, 0, pg, ptr, 0);
 }
 
 __attribute__((target("+sme")))
-void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming __arm_shared_za {
+void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   svst1_ver_za16(0, 0, pg, ptr);
 }
 
 __attribute__((target("+sme")))
-void undefined(svbool_t pg, void *ptr) __arm_shared_za {
+void undefined(svbool_t pg, void *ptr) __arm_inout("za") {
   svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-warning {{builtin call has undefined behaviour when called from a non-streaming function}}
 }

diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 5118f743174c25e..36103480861c3f6 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -5,7 +5,7 @@
 
 #include <arm_sme_draft_spec_subject_to_change.h>
 
-void test_multivector_read(uint32_t base) __arm_streaming __arm_shared_za __arm_preserves_za {
+void test_multivector_read(uint32_t base) __arm_streaming __arm_in("za") {
 
   // Test Tile Range
   svread_hor_za8_u8_vg2(1, base); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
@@ -32,7 +32,7 @@ void test_multivector_read(uint32_t base) __arm_streaming __arm_shared_za __arm_
 void test_multivector_write(uint32_t base, svuint8x2_t v8x2, svuint8x4_t v8x4,
                             svuint16x2_t v16x2, svuint16x4_t v16x4,
                             svuint32x2_t v32x2, svuint32x4_t v32x4,
-                            svuint64x2_t v64x2, svuint64x4_t v64x4) __arm_streaming __arm_shared_za {
+                            svuint64x2_t v64x2, svuint64x4_t v64x4) __arm_streaming __arm_inout("za") {
 
   // Test Tile Range
   svwrite_hor_za8_u8_vg2(1, base, v8x2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
@@ -56,7 +56,7 @@ void test_multivector_write(uint32_t base, svuint8x2_t v8x2, svuint8x4_t v8x4,
   svwrite_ver_za64_u64_vg4(8, base, v64x4); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
 }
 
-void test_outer_product(svbool_t pred, svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32) __arm_streaming __arm_shared_za {
+void test_outer_product(svbool_t pred, svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32) __arm_streaming __arm_inout("za") {
   // Test Tile Range
   svmopa_za32_u16_m(4, pred, pred, u16, u16); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
   svmopa_za32_s16_m(4, pred, pred, s16, s16); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
@@ -71,15 +71,15 @@ void test_outer_product(svbool_t pred, svint16_t s16, svuint16_t u16, svint32_t
   svbmops_za32_s32_m(4, pred, pred, s32, s32); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
 }
 
-void test_ldr_zt(const void *const_base) __arm_streaming_compatible __arm_shared_za {
+void test_ldr_zt(const void *const_base) __arm_streaming_compatible __arm_inout("za") {
   svldr_zt(1, const_base); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
 }
 
-void test_str_zt(void *base) __arm_streaming_compatible __arm_shared_za __arm_preserves_za {
+void test_str_zt(void *base) __arm_streaming_compatible __arm_in("za") {
   svstr_zt(1, base);       // expected-error {{argument value 1 is outside the valid range [0, 0]}}
 }
 
-void test_svluti2_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+void test_svluti2_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("za") {
   // Test Reg Offset
   svluti2_lane_zt_u8_x4(1, zn, 0);   // expected-error {{argument value 1 is outside the valid range [0, 0]}}
   // Test index value range
@@ -106,7 +106,7 @@ void test_svluti2_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm
   svluti2_lane_zt_f32_x4(0, zn, 4);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
 }
 
-void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
+void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("za") {
   // Test Reg Offset
   svluti4_lane_zt_u16_x4(1, zn, 0);   // expected-error {{argument value 1 is outside the valid range [0, 0]}}
   // Test index value range
@@ -129,7 +129,7 @@ void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm
   svluti4_lane_zt_f32_x4(0, zn, 2);   // expected-error {{argument value 2 is outside the valid range [0, 1]}}
 }
 
-void test_svluti2_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
+void test_svluti2_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("za") {
   // Test Reg Offset
   svluti2_lane_zt_u8(1, zn_u8, 2);    // expected-error {{argument value 1 is outside the valid range [0, 0]}}
   // Test index value range
@@ -156,7 +156,7 @@ void test_svluti2_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm
   svluti2_lane_zt_f32(0, zn_u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
 }
 
-void test_svluti4_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
+void test_svluti4_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("za") {
   // Test Reg Offset
   svluti4_lane_zt_u8(1, zn_u8, 2);   // expected-error {{argument value 1 is outside the valid range [0, 0]}}
   // Test index value range
@@ -183,7 +183,7 @@ void test_svluti4_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm
   svluti4_lane_zt_f32(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
 }
 
-void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
+void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_in("za") {
   // Test Reg Offset
   svluti2_lane_zt_u8_x2(1, zn_u8, 2);    // expected-error {{argument value 1 is outside the valid range [0, 0]}}
   // Test index value range
@@ -210,7 +210,7 @@ void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __
   svluti2_lane_zt_f32_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
 }
 
-void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
+void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_in("za") {
   // Test Reg Offset
   svluti4_lane_zt_u8_x2(1, zn_u8, 2);   // expected-error {{argument value 1 is outside the valid range [0, 0]}}
   // Test index value range
@@ -245,7 +245,7 @@ void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __
 void test_multiply_add_sub_long(uint32_t base, svint8_t s8, svuint8_t u8,
                                 svint16_t s16, svuint16_t u16, svint8x2_t s8x2,
                                 svuint8x2_t u8x2, svint16x2_t s16x2, svuint16x2_t u16x2,
-                                svint8x4_t s8x4, svuint8x4_t u8x4, svint16x4_t s16x4, svuint16x4_t u16x4) __arm_streaming __arm_shared_za {
+                                svint8x4_t s8x4, svuint8x4_t u8x4, svint16x4_t s16x4, svuint16x4_t u16x4) __arm_streaming __arm_inout("za") {
 
   svmla_lane_za32_s8_vg4x1(base, s8, s8, 16);   // expected-error {{argument value 16 is outside the valid range [0, 15]}}
   svmla_lane_za32_u8_vg4x1(base, u8, u8, 16);   // expected-error {{argument value 16 is outside the valid range [0, 15]}}
@@ -292,7 +292,7 @@ void test_vertical_dot_product(uint32_t base, svint16x2_t s16x2, svuint16x2_t u1
                                svfloat16x2_t f16x2, svbfloat16x2_t bf16x2,
                                svint16_t s16, svuint16_t u16,
                                svint8_t s8, svuint8_t u8,
-                               svfloat16_t f16, svbfloat16_t b16) __arm_streaming __arm_shared_za {
+                               svfloat16_t f16, svbfloat16_t b16) __arm_streaming __arm_inout("za") {
   // Test lane indices.
   svvdot_lane_za32_s16_vg1x2(base, s16x2, s16, 4);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
   svvdot_lane_za32_u16_vg1x2(base, u16x2, u16, 4);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
@@ -309,7 +309,7 @@ void test_vertical_dot_product(uint32_t base, svint16x2_t s16x2, svuint16x2_t u1
 void test_fdot_za32_bad_lane(uint32_t slice_base, svfloat16_t z_f16,
                              svfloat16x2_t z_f16x2, svfloat16x4_t z_f16x4,
                              svbfloat16_t z_bf16, svbfloat16x2_t z_bf16x2,
-                             svbfloat16x4_t z_bf16x4) __arm_streaming __arm_shared_za {
+                             svbfloat16x4_t z_bf16x4) __arm_streaming __arm_inout("za") {
   // 16-bit float
   svdot_lane_za32_f16_vg1x2(slice_base, z_f16x2, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
   svdot_lane_za32_f16_vg1x4(slice_base, z_f16x4, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
@@ -325,7 +325,7 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
                                     svint16x4_t z_s16x4, svuint8_t z_u8,
                                     svuint8x2_t z_u8x2, svuint8x4_t z_u8x4,
                                     svint8_t z_s8, svint8x2_t z_s8x2,
-                                    svint8x4_t z_s8x4) __arm_streaming __arm_shared_za {
+                                    svint8x4_t z_s8x4) __arm_streaming __arm_inout("za") {
   // Multi, indexed (unsigned)
   svdot_lane_za32_u16_vg1x2(slice_base, z_u16x2, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
   svdot_lane_za32_u16_vg1x4(slice_base, z_u16x4, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}

diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 4ec00573e8a9da8..c35490cf2e5aebb 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -3547,23 +3547,27 @@ static void GenerateHasAttrSpellingStringSwitch(
   OS << "    .Default(0);\n";
 }
 
-// Emits the list of tokens for regular keyword attributes.
-void EmitClangAttrTokenKinds(RecordKeeper &Records, raw_ostream &OS) {
-  emitSourceFileHeader("A list of tokens generated from the attribute"
-                       " definitions",
-                       OS);
+// Emits list of regular keyword attributes with info about their arguments.
+void EmitClangRegularKeywordAttributeInfo(RecordKeeper &Records,
+                                          raw_ostream &OS) {
+  emitSourceFileHeader(
+      "A list of regular keyword attributes generated from the attribute"
+      " definitions",
+      OS);
   // Assume for now that the same token is not used in multiple regular
   // keyword attributes.
   for (auto *R : Records.getAllDerivedDefinitions("Attr"))
-    for (const auto &S : GetFlattenedSpellings(*R))
-      if (isRegularKeywordAttribute(S)) {
-        if (!R->getValueAsListOfDefs("Args").empty())
-          PrintError(R->getLoc(),
-                     "RegularKeyword attributes with arguments are not "
-                     "yet supported");
-        OS << "KEYWORD_ATTRIBUTE("
-           << S.getSpellingRecord().getValueAsString("Name") << ")\n";
-      }
+    for (const auto &S : GetFlattenedSpellings(*R)) {
+      if (!isRegularKeywordAttribute(S))
+        continue;
+      std::vector<Record *> Args = R->getValueAsListOfDefs("Args");
+      bool HasArgs = llvm::any_of(
+          Args, [](const Record *Arg) { return !Arg->getValueAsBit("Fake"); });
+
+      OS << "KEYWORD_ATTRIBUTE("
+         << S.getSpellingRecord().getValueAsString("Name") << ", "
+         << (HasArgs ? "true" : "false") << ")\n";
+    }
   OS << "#undef KEYWORD_ATTRIBUTE\n";
 }
 

diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 5de2223e71b046b..060d79a06af88d2 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1619,7 +1619,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
   OS << "}\n\n";
 
   OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) "
-        "__arm_streaming_compatible __arm_shared_za "
+        "__arm_streaming_compatible __arm_out(\"za\") "
         "{ }\n\n";
 
   createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME);

diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
index 3859555d647fd14..158d10e2b3d6bc7 100644
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -37,7 +37,7 @@ enum ActionType {
   GenClangAttrSubjectMatchRuleList,
   GenClangAttrPCHRead,
   GenClangAttrPCHWrite,
-  GenClangAttrTokenKinds,
+  GenClangRegularKeywordAttributeInfo,
   GenClangAttrHasAttributeImpl,
   GenClangAttrSpellingListIndex,
   GenClangAttrASTVisitor,
@@ -150,8 +150,10 @@ cl::opt<ActionType> Action(
                    "Generate clang PCH attribute reader"),
         clEnumValN(GenClangAttrPCHWrite, "gen-clang-attr-pch-write",
                    "Generate clang PCH attribute writer"),
-        clEnumValN(GenClangAttrTokenKinds, "gen-clang-attr-token-kinds",
-                   "Generate a list of attribute-related clang tokens"),
+        clEnumValN(GenClangRegularKeywordAttributeInfo,
+                   "gen-clang-regular-keyword-attr-info",
+                   "Generate a list of regular keyword attributes with info "
+                   "about their arguments"),
         clEnumValN(GenClangAttrHasAttributeImpl,
                    "gen-clang-attr-has-attribute-impl",
                    "Generate a clang attribute spelling list"),
@@ -291,11 +293,14 @@ cl::opt<ActionType> Action(
                    "Generate riscv_vector_builtin_cg.inc for clang"),
         clEnumValN(GenRISCVVectorBuiltinSema, "gen-riscv-vector-builtin-sema",
                    "Generate riscv_vector_builtin_sema.inc for clang"),
-        clEnumValN(GenRISCVSiFiveVectorBuiltins, "gen-riscv-sifive-vector-builtins",
+        clEnumValN(GenRISCVSiFiveVectorBuiltins,
+                   "gen-riscv-sifive-vector-builtins",
                    "Generate riscv_sifive_vector_builtins.inc for clang"),
-        clEnumValN(GenRISCVSiFiveVectorBuiltinCG, "gen-riscv-sifive-vector-builtin-codegen",
+        clEnumValN(GenRISCVSiFiveVectorBuiltinCG,
+                   "gen-riscv-sifive-vector-builtin-codegen",
                    "Generate riscv_sifive_vector_builtin_cg.inc for clang"),
-        clEnumValN(GenRISCVSiFiveVectorBuiltinSema, "gen-riscv-sifive-vector-builtin-sema",
+        clEnumValN(GenRISCVSiFiveVectorBuiltinSema,
+                   "gen-riscv-sifive-vector-builtin-sema",
                    "Generate riscv_sifive_vector_builtin_sema.inc for clang"),
         clEnumValN(GenAttrDocs, "gen-attr-docs",
                    "Generate attribute documentation"),
@@ -355,8 +360,8 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenClangAttrPCHWrite:
     EmitClangAttrPCHWrite(Records, OS);
     break;
-  case GenClangAttrTokenKinds:
-    EmitClangAttrTokenKinds(Records, OS);
+  case GenClangRegularKeywordAttributeInfo:
+    EmitClangRegularKeywordAttributeInfo(Records, OS);
     break;
   case GenClangAttrHasAttributeImpl:
     EmitClangAttrHasAttrImpl(Records, OS);

diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index faa0c5d2cff9e3c..58a4af3c23a6740 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -53,8 +53,8 @@ void EmitClangAttrSubjectMatchRuleList(llvm::RecordKeeper &Records,
                                        llvm::raw_ostream &OS);
 void EmitClangAttrPCHRead(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitClangAttrPCHWrite(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangAttrTokenKinds(llvm::RecordKeeper &Records,
-                             llvm::raw_ostream &OS);
+void EmitClangRegularKeywordAttributeInfo(llvm::RecordKeeper &Records,
+                                          llvm::raw_ostream &OS);
 void EmitClangAttrHasAttrImpl(llvm::RecordKeeper &Records,
                               llvm::raw_ostream &OS);
 void EmitClangAttrSpellingListIndex(llvm::RecordKeeper &Records,


        

