[llvm] [AArch64] Model ZA array using inaccessible memory (PR #132058)

via llvm-commits <llvm-commits@lists.llvm.org>
Fri Apr 11 08:21:13 PDT 2025


https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/132058

>From 1e85f2acb77d83678c21ad72f2d348f487f0518d Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac@arm.com>
Date: Wed, 19 Mar 2025 15:35:59 +0000
Subject: [PATCH 1/4] [AArch64] Model ZA array using inaccessible memory

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  80 ++--
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  39 +-
 .../AArch64/sme-intrinsics-mova-extract.ll    | 259 ++++++++-----
 .../AArch64/sme2-intrinsics-extract-mova.ll   | 362 ++++++++++++------
 4 files changed, 483 insertions(+), 257 deletions(-)
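
For context, the practical effect of this patch is to replace IntrNoMem + IntrHasSideEffects (or no attributes at all) on the SME/SME2 ZA intrinsics with read/write attributes on inaccessible memory, so the optimizer can treat ZA accesses like ordinary memory accesses that never alias user-visible memory. The snippet below is an illustrative sketch only, not part of the patch; the function name and the use of the "aarch64_inout_za" attribute are examples. It shows the intended semantics: the two identical reads may be CSE'd because nothing writes ZA between them, while the sme.zero call, now modeled as a write to inaccessible memory, acts as a clobber that keeps the third read distinct.

  declare <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32, i32)
  declare void @llvm.aarch64.sme.zero(i32)

  define <vscale x 4 x i32> @za_read_cse_sketch(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %slice) "aarch64_inout_za" {
    ; %a and %b read the same ZA slice with no intervening ZA write, so with
    ; IntrReadMem + IntrInaccessibleMemOnly they are candidates for CSE.
    %a = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %slice)
    %b = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %slice)
    ; sme.zero is now IntrWriteMem + IntrInaccessibleMemOnly, so it clobbers
    ; the modeled ZA state and prevents %c from being folded into %a/%b.
    call void @llvm.aarch64.sme.zero(i32 255)
    %c = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %slice)
    %ab = add <vscale x 4 x i32> %a, %b
    %abc = add <vscale x 4 x i32> %ab, %c
    ret <vscale x 4 x i32> %abc
  }
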

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6dfc3c8f2a393..7648fc55d54ae 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2940,7 +2940,7 @@ def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic;
 let TargetPrefix = "aarch64" in {
   class SME_Load_Store_Intrinsic<LLVMType pred_ty>
     : DefaultAttrsIntrinsic<[],
-        [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+        [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>]>;
 
   // Loads
   def int_aarch64_sme_ld1b_horiz : SME_Load_Store_Intrinsic<llvm_nxv16i1_ty>;
@@ -2968,18 +2968,18 @@ let TargetPrefix = "aarch64" in {
 
   // Spill + fill
   class SME_LDR_STR_ZA_Intrinsic
-    : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>;
+    : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly]>;
   def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic;
   def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic;
 
   class SME_TileToVector_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
           [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-           llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+           llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
   class SME_VectorToTile_Intrinsic
       : DefaultAttrsIntrinsic<[],
           [llvm_i32_ty, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-           llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+           llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_read_horiz  : SME_TileToVector_Intrinsic;
   def int_aarch64_sme_read_vert   : SME_TileToVector_Intrinsic;
@@ -2994,13 +2994,13 @@ let TargetPrefix = "aarch64" in {
   class SME_MOVAZ_TileToVector_X2_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
           [llvm_i32_ty, llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   class SME_MOVAZ_TileToVector_X4_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
            LLVMMatchType<0>,LLVMMatchType<0>],
           [llvm_i32_ty, llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_readz_horiz_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic;
   def int_aarch64_sme_readz_vert_x2  : SME_MOVAZ_TileToVector_X2_Intrinsic;
@@ -3011,7 +3011,7 @@ let TargetPrefix = "aarch64" in {
   class SME_MOVAZ_TileToVector_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
           [llvm_i32_ty, llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_readz_horiz : SME_MOVAZ_TileToVector_Intrinsic;
   def int_aarch64_sme_readz_vert  : SME_MOVAZ_TileToVector_Intrinsic;
@@ -3022,12 +3022,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_readz_x2
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
           [llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects]>;
+          [IntrInaccessibleMemOnly]>;
 
   def int_aarch64_sme_readz_x4
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
           [llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects]>;
+          [IntrInaccessibleMemOnly]>;
 
   def int_aarch64_sme_write_lane_zt
        :  DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty],
@@ -3038,7 +3038,7 @@ let TargetPrefix = "aarch64" in {
             [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
 
 
-  def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+  def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
   def int_aarch64_sme_in_streaming_mode : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_arm_in_streaming_mode">;
 
   class SME_OuterProduct_Intrinsic
@@ -3047,7 +3047,7 @@ let TargetPrefix = "aarch64" in {
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
            LLVMMatchType<0>,
-           llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+           llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic;
   def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic;
@@ -3069,7 +3069,7 @@ let TargetPrefix = "aarch64" in {
           [llvm_i32_ty,
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-           llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+           llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_addha : SME_AddVectorToTile_Intrinsic;
   def int_aarch64_sme_addva : SME_AddVectorToTile_Intrinsic;
@@ -3189,56 +3189,56 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_Single_Index_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                 llvm_anyvector_ty,
                 LLVMMatchType<0>, llvm_i32_ty],
-                [ImmArg<ArgIndex<3>>]>;
+                [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
 
   class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, llvm_i32_ty],
-                [ImmArg<ArgIndex<4>>]>;
+                [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
 
   class SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, llvm_i32_ty],
-                [ImmArg<ArgIndex<6>>]>;
+                [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
 
   class SME2_VG2_Multi_Imm_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
@@ -3257,14 +3257,14 @@ let TargetPrefix = "aarch64" in {
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3353,50 +3353,50 @@ let TargetPrefix = "aarch64" in {
   class SME2_ZA_ArrayVector_Read_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_ArrayVector_Read_VG4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                 [llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_TileVector_Read_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [llvm_i32_ty, llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_TileVector_Read_VG4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                 [llvm_i32_ty, llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_ArrayVector_Write_VG2_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_ArrayVector_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_TileVector_Write_VG2_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty, llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               [ImmArg<ArgIndex<0>>]>;
+               [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   class SME2_Matrix_TileVector_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty, llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               [ImmArg<ArgIndex<0>>]>;
+               [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   class SME2_VG2_Multi_Single_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3562,7 +3562,7 @@ let TargetPrefix = "aarch64" in {
   // Multi-vector zeroing
 
   foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
-    def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrNoMem, IntrHasSideEffects]>;
+    def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrWriteMem, IntrInaccessibleMemOnly]>;
   }
   
   // Multi-vector signed saturating doubling multiply high
@@ -4002,57 +4002,57 @@ let TargetPrefix = "aarch64" in {
           [llvm_i32_ty,
           llvm_nxv16i1_ty, llvm_nxv16i1_ty,
           llvm_nxv16i8_ty, llvm_nxv16i8_ty],
-          [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+          [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_LANE_VGx1_Intrinsic
    : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                llvm_nxv16i8_ty,
                                llvm_nxv16i8_ty,
                                llvm_i32_ty],
-                          [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<3>>]>;
+                          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
 
   class SME_FP8_ZA_LANE_VGx2_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty,
                                 llvm_i32_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;
+                            [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
 
   class SME_FP8_ZA_LANE_VGx4_Intrinsic
    : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty,
                                 llvm_i32_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
+                            [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
   class SME_FP8_ZA_SINGLE_VGx1_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_SINGLE_VGx2_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_SINGLE_VGx4_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty],
-                              [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                              [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_MULTI_VGx2_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_MULTI_VGx4_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
   //
   // CVT from FP8 to half-precision/BFloat16 multi-vector
   //
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 4f6a413ba5e5c..ff850751acf48 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -102,6 +102,8 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
   // Translated to the actual instructions in AArch64ISelLowering.cpp
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -110,6 +112,8 @@ class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, Regis
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -118,6 +122,8 @@ class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, Regist
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -126,6 +132,8 @@ class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, Regist
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, imm_ty:$i), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -133,6 +141,7 @@ class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -140,6 +149,7 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
       Pseudo<(outs), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum za_flag>
@@ -147,6 +157,7 @@ class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, index_ty:$imm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag>
@@ -154,6 +165,8 @@ class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, R
       Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -162,6 +175,8 @@ class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOpe
       Pseudo<(outs multi_vector_ty:$Zd), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -665,6 +680,8 @@ class sme_add_vector_to_tile_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
   // Translated to the actual instructions in AArch64ISelLowering.cpp
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 multiclass sme_add_vector_to_tile_u32<bit V, string mnemonic, SDPatternOperator op> {
@@ -1123,6 +1140,7 @@ class sme_mova_insert_pseudo<SMEMatrixTypeEnum za_flag>
   // Translated to the actual instructions in AArch64ISelLowering.cpp
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
@@ -1317,6 +1335,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
                                    is_col, sme_elm_idx0_15, mnemonic> {
     bits<4> imm;
     let Inst{8-5} = imm;
+    let mayLoad = 1;
   }
   def _H : sme_tile_to_vector_inst<0b0, 0b01, ZPR16, !if(is_col, TileVectorOpV16,
                                                                  TileVectorOpH16),
@@ -1325,6 +1344,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
     bits<3> imm;
     let Inst{8}   = ZAn;
     let Inst{7-5} = imm;
+    let mayLoad = 1;
   }
   def _S : sme_tile_to_vector_inst<0b0, 0b10, ZPR32, !if(is_col, TileVectorOpV32,
                                                                  TileVectorOpH32),
@@ -1333,6 +1353,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
     bits<2> imm;
     let Inst{8-7} = ZAn;
     let Inst{6-5} = imm;
+    let mayLoad = 1;
   }
   def _D : sme_tile_to_vector_inst<0b0, 0b11, ZPR64, !if(is_col, TileVectorOpV64,
                                                                  TileVectorOpH64),
@@ -1341,12 +1362,14 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
     bits<1> imm;
     let Inst{8-6} = ZAn;
     let Inst{5}   = imm;
+    let mayLoad = 1;
   }
   def _Q : sme_tile_to_vector_inst<0b1, 0b11, ZPR128, !if(is_col, TileVectorOpV128,
                                                                   TileVectorOpH128),
                                    is_col, sme_elm_idx0_0, mnemonic> {
     bits<4> ZAn;
     let Inst{8-5} = ZAn;
+    let mayLoad = 1;
   }
 
   defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _B), ZPR8,
@@ -1817,7 +1840,9 @@ multiclass sme2_multivec_accum_add_sub_vg2<string mnemonic, bits<4> op,
   def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
   (!cast<Instruction>(NAME) matrix_ty:$ZAdn,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
 
-  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+    let mayLoad = 1;
+  }
   def : SME2_ZA_VG1x2_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
 }
 
@@ -1840,7 +1865,9 @@ multiclass sme2_multivec_accum_add_sub_vg4<string mnemonic, bits<4> op,
   def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
   (!cast<Instruction>(NAME) matrix_ty:$ZAdn,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
 
-  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+    let mayLoad = 1;
+  }
   def : SME2_ZA_VG1x4_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
 }
 
@@ -4349,6 +4376,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
                                                  uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
     bits<3> imm;
     let Inst{7-5} = imm;
+    let mayLoad = 1;
   }
 
   def _H : sme2_mova_tile_to_vec_vg2_multi_base<0b01, v, opc, ZZ_h_mul_r,
@@ -4359,6 +4387,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
     bits<2> imm;
     let Inst{7}   = ZAn;
     let Inst{6-5} = imm;
+    let mayLoad = 1;
   }
 
   def _S : sme2_mova_tile_to_vec_vg2_multi_base<0b10, v, opc, ZZ_s_mul_r,
@@ -4369,6 +4398,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
     bits<1> imm;
     let Inst{7-6} = ZAn;
     let Inst{5}   = imm;
+    let mayLoad = 1;
   }
 
   def _D : sme2_mova_tile_to_vec_vg2_multi_base<0b11, v, opc, ZZ_d_mul_r,
@@ -4377,6 +4407,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
                                                 uimm0s2range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
     bits<3> ZAn;
     let Inst{7-5} = ZAn;
+    let mayLoad = 1;
   }
 
   if !eq(mnemonic, "mova") then {
@@ -4491,6 +4522,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
                                                 uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
     bits<2> imm;
     let Inst{6-5} = imm;
+    let mayLoad = 1;
   }
 
   def _H : sme2_mova_tile_to_vec_vg4_multi_base<0b01, v, {opc,0,?,?},
@@ -4502,6 +4534,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
     bits<1> imm;
     let Inst{6}   = ZAn;
     let Inst{5}   = imm;
+    let mayLoad = 1;
   }
 
   def _S : sme2_mova_tile_to_vec_vg4_multi_base<0b10, v, {opc,0,?,?},
@@ -4511,6 +4544,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
                                                  uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
     bits<2> ZAn;
     let Inst{6-5} = ZAn;
+    let mayLoad = 1;
   }
 
   def _D : sme2_mova_tile_to_vec_vg4_multi_base<0b11, v, {opc,?,?,?},
@@ -4520,6 +4554,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
                                                 uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
     bits<3> ZAn;
     let Inst{7-5} = ZAn;
+    let mayLoad = 1;
   }
 
   if !eq(mnemonic, "mova") then {
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 48fbd14bd8540..5370b3e9dc9df 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -1,27 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
 
-define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_b:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.b, p0/m, za0h.b[w12, 0]
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 2]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.b, p0/m, za0h.b[w12, 0]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.b, p0/m, za0h.b[w12, 2]
 ; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 4]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 6]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 8]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 10]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 12]
-; CHECK-NEXT:    mov z0.b, p0/m, za0h.b[w12, 14]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.b, p0/m, za0h.b[w12, 6]
+; CHECK-NEXT:    mov z4.b, p0/m, za0h.b[w12, 8]
+; CHECK-NEXT:    mov z5.b, p0/m, za0h.b[w12, 10]
+; CHECK-NEXT:    mov z6.b, p0/m, za0h.b[w12, 12]
+; CHECK-NEXT:    mov z7.b, p0/m, za0h.b[w12, 14]
+; CHECK-NEXT:    b dummy_use_8_nxv16i8
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -37,30 +36,33 @@ define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x
   %z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.12)
   %tileslice.14 = add i32 %tileslice, 14
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
-  ret <vscale x 16 x i8> %z0
+
+  ; Force retention of z0..z7
+  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  ret void
 }
 
-define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_b:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.b, p0/m, za0v.b[w12, 1]
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 3]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.b, p0/m, za0v.b[w12, 1]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.b, p0/m, za0v.b[w12, 3]
 ; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 5]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 7]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 9]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 11]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 13]
-; CHECK-NEXT:    mov z0.b, p0/m, za0v.b[w12, 15]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.b, p0/m, za0v.b[w12, 7]
+; CHECK-NEXT:    mov z4.b, p0/m, za0v.b[w12, 9]
+; CHECK-NEXT:    mov z5.b, p0/m, za0v.b[w12, 11]
+; CHECK-NEXT:    mov z6.b, p0/m, za0v.b[w12, 13]
+; CHECK-NEXT:    mov z7.b, p0/m, za0v.b[w12, 15]
+; CHECK-NEXT:    b dummy_use_8_nxv16i8
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -77,22 +79,24 @@ define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x
   %z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.13)
   %tileslice.15 = add i32 %tileslice, 15
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
-  ret <vscale x 16 x i8> %z0
+
+  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  ret void
 }
 
-define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_h:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 2]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 2]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 6]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za0h.h[w12, 6]
+; CHECK-NEXT:    b dummy_use_4_nxv8i16
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -100,22 +104,23 @@ define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
   %z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.4)
   %tileslice.6 = add i32 %tileslice, 6
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
-  ret <vscale x 8 x i16> %z0
+
+  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  ret void
 }
 
-define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_h:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za1v.h[w12, 1]
-; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 3]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za1v.h[w12, 1]
+; CHECK-NEXT:    mov z1.h, p0/m, za1v.h[w12, 3]
 ; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 5]
-; CHECK-NEXT:    mov z0.h, p0/m, za1v.h[w12, 7]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za1v.h[w12, 7]
+; CHECK-NEXT:    b dummy_use_4_nxv8i16
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -124,30 +129,31 @@ define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
   %z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.5)
   %tileslice.7 = add i32 %tileslice, 7
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
-  ret <vscale x 8 x i16> %z0
+
+  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  ret void
 }
 
-define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT:    mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT:    mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT:    b dummy_use_8_nxv8f16
   %z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -163,30 +169,32 @@ define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i
   %z6 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
-  ret <vscale x 8 x half> %z0
+
+  tail call void @dummy_use_8_nxv8f16(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
+                                 <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
+  ret void
 }
 
-define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
+define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
 ; CHECK-LABEL: extract_bf16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT:    mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT:    mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT:    b dummy_use_8_nxv8bf16
   %z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -202,53 +210,57 @@ define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x
   %z6 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
-  ret <vscale x 8 x bfloat> %z0
+
+  tail call void @dummy_use_8_nxv8bf16(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
+                                  <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
+  ret void
 }
 
-define <vscale x 4 x i32> @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 2]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 2]
+; CHECK-NEXT:    b dummy_use_2_nxv4i32
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
-  ret <vscale x 4 x i32> %z0
+
+  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  ret void
 }
 
-define <vscale x 4 x i32> @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.s, p0/m, za3v.s[w12, 1]
-; CHECK-NEXT:    mov z0.s, p0/m, za3v.s[w12, 3]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z0.s, p0/m, za3v.s[w12, 1]
+; CHECK-NEXT:    mov z1.s, p0/m, za3v.s[w12, 3]
+; CHECK-NEXT:    b dummy_use_2_nxv4i32
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
-  ret <vscale x 4 x i32> %z0
+
+  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  ret void
 }
 
-define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT:    mov z2.s, p0/m, za0h.s[w12, 1]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 1]
 ; CHECK-NEXT:    mov z2.s, p0/m, za0v.s[w12, 2]
-; CHECK-NEXT:    mov z0.s, p0/m, za0v.s[w12, 3]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.s, p0/m, za0v.s[w12, 3]
+; CHECK-NEXT:    b dummy_use_4_nxv4f32
   %z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -256,7 +268,9 @@ define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x
   %z2 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
   %tileslice.3 = add i32 %tileslice, 3
   %z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
-  ret <vscale x 4 x float> %z0
+
+  tail call void @dummy_use_4_nxv4f32(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
+  ret void
 }
 
 define <vscale x 2 x i64> @extract_row_d(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
@@ -280,19 +294,20 @@ define <vscale x 2 x i64> @extract_col_d(<vscale x 2 x i64> %zd, <vscale x 2 x i
   ret <vscale x 2 x i64> %z0
 }
 
-define <vscale x 2 x double> @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
+define void @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.d, p0/m, za0h.d[w12, 0]
-; CHECK-NEXT:    mov z0.d, p0/m, za0v.d[w12, 1]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z0.d, p0/m, za0h.d[w12, 0]
+; CHECK-NEXT:    mov z1.d, p0/m, za0v.d[w12, 1]
+; CHECK-NEXT:    b dummy_use_2_nxv2f64
   %z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
-  ret <vscale x 2 x double> %z0
+
+  tail call void @dummy_use_2_nxv2f64(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
+  ret void
 }
 
 define <vscale x 16 x i8> @extract_row_q_v16i18(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg) {
@@ -507,3 +522,35 @@ declare <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i3
 declare <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
 declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
+
+; ------------------------------------------------------------------------------
+; Dummy external functions to force code retention.
+; The compiler does not see their implementations, so it must keep the calls.
+; ------------------------------------------------------------------------------
+
+declare void @dummy_use_8_nxv16i8(
+  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>
+)
+
+declare void @dummy_use_4_nxv8i16(
+  <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>
+)
+
+declare void @dummy_use_8_nxv8f16(
+  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>
+)
+
+declare void @dummy_use_8_nxv8bf16(
+  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>
+)
+
+declare void @dummy_use_2_nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare void @dummy_use_4_nxv4f32(
+  <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>
+)
+
+declare void @dummy_use_2_nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
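
The test updates in the next file follow from the same attribute change: once the multi-vector reads are modeled as loads from inaccessible memory (readonly, nounwind, willreturn), a call whose results are never used is trivially dead and can be removed, and identical reads can be merged. The updated tests therefore keep every result live, either by passing it to an external dummy callee (as above) or by combining the extracted values with an add. The sketch below is illustrative only and not part of the patch; the function name and attribute usage are assumptions.

  declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)

  define <vscale x 16 x i8> @dead_read_sketch(i32 %slice) "aarch64_inout_za" {
    ; %dead has no uses; with the new attributes this call may be deleted by DCE.
    %dead = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
    %slice.2 = add i32 %slice, 2
    %live = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.2)
    ; Extracting and returning a field keeps the second read alive.
    %v = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %live, 0
    ret <vscale x 16 x i8> %v
  }
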
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
index ca5399a0503e9..c01c96cc56975 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-b16b16 -verify-machineinstrs -force-streaming < %s | FileCheck %s
 
 ;
 ; Move Multi-Vector From Tile (Read) x2
@@ -7,82 +7,106 @@
 
 ; Horizontal
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg2_b(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 0:1]
-; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT:    mov { z2.b, z3.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT:    add z0.b, z0.b, z2.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
   %slice.14 = add i32 %slice, 14
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg2_h(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg2_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg2_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_horiz_vg2_s(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum     = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_horiz_vg2_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    fadd z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum     = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
 define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
@@ -107,82 +131,106 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i
 
 ; Vertical
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg2_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 0:1]
-; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT:    mov { z2.b, z3.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT:    add z0.b, z0.b, z2.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
   %slice.14 = add i32 %slice, 14
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg2_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg2_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg2_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vert_vg2_s(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum     = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vert_vg2_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    fadd z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum     = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
 define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
@@ -211,56 +259,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i3
 
 ; Horizontal
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg4_b(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 0:3]
-; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT:    mov { z4.b - z7.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT:    add z0.b, z0.b, z4.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
   %slice.12 = add i32 %slice, 12
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg4_h(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    add z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg4_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg4_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
 define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
@@ -305,56 +369,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 
 ; Vertical
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg4_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 0:3]
-; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT:    mov { z4.b - z7.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT:    add z0.b, z0.b, z4.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
   %slice.12 = add i32 %slice, 12
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg4_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    add z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg4_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg4_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
 define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
@@ -399,214 +479,278 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 
 ; Move Multi-Vector From ZA (Read) x2
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x2_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.b, z0.b, z2.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice.7)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x2_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice.7)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum      = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x2_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    fadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice.7)
-  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum      = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x2_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice.7)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum      = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x2_s(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice.7)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum      = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x2_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    fadd z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice.7)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum      = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
-define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x2_d(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
-  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+  %val1     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+  %sum      = add <vscale x 2 x i64> %val1, %val2
+  ret <vscale x 2 x i64> %sum
 }
 
-define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x2_f64(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    fadd z0.d, z0.d, z2.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
-  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
+  %val1     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+  %sum      = fadd <vscale x 2 x double> %val1, %val2
+  ret <vscale x 2 x double> %sum
 }
 
 ; Move Multi-Vector From ZA (Read) x4
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x4_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.b, z0.b, z4.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice.7)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x4_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice.7)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum      = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x4_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    fadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice.7)
-  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum      = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x4_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice.7)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum      = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x4_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x4_s(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.s, z0.s, z4.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice.7)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum      = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x4_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x4_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    fadd z0.s, z0.s, z4.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice.7)
-  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum      = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
-define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x4_d(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.d, z0.d, z4.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
-  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+  %val1     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+  %sum      = add <vscale x 2 x i64> %val1, %val2
+  ret <vscale x 2 x i64> %sum
 }
 
-define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x4_f64(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    fadd z0.d, z0.d, z4.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
-  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
+  %val1     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+  %sum      = fadd <vscale x 2 x double> %val1, %val2
+  ret <vscale x 2 x double> %sum
 }
 
 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)

>From 9077c3a872e3b718c9b33b2a4f6e94fcc723bdfa Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Tue, 8 Apr 2025 16:57:03 +0000
Subject: [PATCH 2/4] Simplify dummy functions

---
 .../AArch64/sme-intrinsics-mova-extract.ll    | 109 ++++++++----------
 1 file changed, 48 insertions(+), 61 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 5370b3e9dc9df..4324b6ec64f25 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -20,7 +20,7 @@ define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
 ; CHECK-NEXT:    mov z5.b, p0/m, za0h.b[w12, 10]
 ; CHECK-NEXT:    mov z6.b, p0/m, za0h.b[w12, 12]
 ; CHECK-NEXT:    mov z7.b, p0/m, za0h.b[w12, 14]
-; CHECK-NEXT:    b dummy_use_8_nxv16i8
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -38,8 +38,8 @@ define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
 
   ; Force retention of z0..z7
-  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
-                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                      <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
   ret void
 }
 
@@ -62,7 +62,7 @@ define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
 ; CHECK-NEXT:    mov z5.b, p0/m, za0v.b[w12, 11]
 ; CHECK-NEXT:    mov z6.b, p0/m, za0v.b[w12, 13]
 ; CHECK-NEXT:    mov z7.b, p0/m, za0v.b[w12, 15]
-; CHECK-NEXT:    b dummy_use_8_nxv16i8
+; CHECK-NEXT:    b use
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -80,8 +80,8 @@ define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
   %tileslice.15 = add i32 %tileslice, 15
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
 
-  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
-                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                      <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
   ret void
 }
 
@@ -96,7 +96,7 @@ define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 2]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
 ; CHECK-NEXT:    mov z3.h, p0/m, za0h.h[w12, 6]
-; CHECK-NEXT:    b dummy_use_4_nxv8i16
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -105,7 +105,7 @@ define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
   %tileslice.6 = add i32 %tileslice, 6
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
 
-  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
   ret void
 }
 
@@ -120,7 +120,7 @@ define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov z1.h, p0/m, za1v.h[w12, 3]
 ; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 5]
 ; CHECK-NEXT:    mov z3.h, p0/m, za1v.h[w12, 7]
-; CHECK-NEXT:    b dummy_use_4_nxv8i16
+; CHECK-NEXT:    b use
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -130,7 +130,7 @@ define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
   %tileslice.7 = add i32 %tileslice, 7
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
 
-  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
   ret void
 }
 
@@ -153,7 +153,7 @@ define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %ti
 ; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
 ; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
 ; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    b dummy_use_8_nxv8f16
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -170,8 +170,8 @@ define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %ti
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
 
-  tail call void @dummy_use_8_nxv8f16(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
-                                 <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
+  tail call void @use(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
+                      <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
   ret void
 }
 
@@ -194,7 +194,7 @@ define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32
 ; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
 ; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
 ; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    b dummy_use_8_nxv8bf16
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -211,8 +211,8 @@ define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
 
-  tail call void @dummy_use_8_nxv8bf16(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
-                                  <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
+  tail call void @use(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
+                      <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
   ret void
 }
 
@@ -223,12 +223,12 @@ define void @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 0]
 ; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 2]
-; CHECK-NEXT:    b dummy_use_2_nxv4i32
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
 
-  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
   ret void
 }
 
@@ -239,13 +239,13 @@ define void @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.s, p0/m, za3v.s[w12, 1]
 ; CHECK-NEXT:    mov z1.s, p0/m, za3v.s[w12, 3]
-; CHECK-NEXT:    b dummy_use_2_nxv4i32
+; CHECK-NEXT:    b use
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
 
-  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
   ret void
 }
 
@@ -260,7 +260,7 @@ define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 1]
 ; CHECK-NEXT:    mov z2.s, p0/m, za0v.s[w12, 2]
 ; CHECK-NEXT:    mov z3.s, p0/m, za0v.s[w12, 3]
-; CHECK-NEXT:    b dummy_use_4_nxv4f32
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -269,7 +269,7 @@ define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %t
   %tileslice.3 = add i32 %tileslice, 3
   %z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
 
-  tail call void @dummy_use_4_nxv4f32(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
+  tail call void @use(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
   ret void
 }
 
@@ -301,12 +301,12 @@ define void @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.d, p0/m, za0h.d[w12, 0]
 ; CHECK-NEXT:    mov z1.d, p0/m, za0v.d[w12, 1]
-; CHECK-NEXT:    b dummy_use_2_nxv2f64
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
 
-  tail call void @dummy_use_2_nxv2f64(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
+  tail call void @use(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
   ret void
 }
 
@@ -453,21 +453,33 @@ define <vscale x 2 x double> @extract_col_q_v2f64(<vscale x 2 x double> %zd, <vs
 define <vscale x 4 x i32> @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
 ; CHECK-LABEL: test_sink_offset_operand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    mov z3.s, #0 // =0x0
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:  .LBB26_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z3.d
+; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    subs w1, w1, #3
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT:    mov z2.s, p0/m, za0h.s[w12, 1]
-; CHECK-NEXT:    mov z3.s, p0/m, za0h.s[w12, 2]
+; CHECK-NEXT:    mov z2.d, z3.d
+; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 1]
+; CHECK-NEXT:    mov z2.s, p0/m, za0h.s[w12, 2]
 ; CHECK-NEXT:    b.ne .LBB26_1
 ; CHECK-NEXT:  // %bb.2: // %exit
-; CHECK-NEXT:    add z0.s, z1.s, z2.s
-; CHECK-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-NEXT:    add z3.s, z0.s, z1.s
+; CHECK-NEXT:    add z8.s, z3.s, z2.s
+; CHECK-NEXT:    bl use
+; CHECK-NEXT:    mov z0.d, z8.d
+; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %add1 = add i32 %base, 1
@@ -486,6 +498,7 @@ for.body:
 exit:
   %tmp1 = add <vscale x 4 x i32> %z0, %z1
   %res = add <vscale x 4 x i32> %tmp1, %z2
+  tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
   ret <vscale x 4 x i32> %res
 }
 
@@ -524,33 +537,7 @@ declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i6
 declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
 
 ; ------------------------------------------------------------------------------
-; Dummy external functions to force code retention.
-; The compiler does not see their implementations, so it must keep the calls.
+; Dummy external function to force code retention.
 ; ------------------------------------------------------------------------------
 
-declare void @dummy_use_8_nxv16i8(
-  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
-  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>
-)
-
-declare void @dummy_use_4_nxv8i16(
-  <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>
-)
-
-declare void @dummy_use_8_nxv8f16(
-  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
-  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>
-)
-
-declare void @dummy_use_8_nxv8bf16(
-  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
-  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>
-)
-
-declare void @dummy_use_2_nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-
-declare void @dummy_use_4_nxv4f32(
-  <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>
-)
-
-declare void @dummy_use_2_nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare void @use(...)

>From b0900684e2fde319034a415c2c499e98353b2821 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 10 Apr 2025 13:26:18 +0000
Subject: [PATCH 3/4] Rename SME intrinsic classes that do not access ZA to SVE

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 201 +++++++++++-----------
 1 file changed, 99 insertions(+), 102 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7648fc55d54ae..222a20d9e4044 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3098,9 +3098,9 @@ let TargetPrefix = "aarch64" in {
                               [IntrNoMem, IntrHasSideEffects]>;
 
   def int_aarch64_sme_za_enable
-      : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+      : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
   def int_aarch64_sme_za_disable
-      : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+      : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   // Clamp
   //
@@ -3240,13 +3240,13 @@ let TargetPrefix = "aarch64" in {
                  LLVMMatchType<0>, llvm_i32_ty],
                 [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
 
-  class SME2_VG2_Multi_Imm_Intrinsic
+  class SVE2_VG2_Multi_Imm_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                 [llvm_anyvector_ty, LLVMMatchType<0>,
                  llvm_i32_ty],
                 [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
-  class SME2_VG4_Multi_Imm_Intrinsic
+  class SVE2_VG4_Multi_Imm_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
                 [llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3266,13 +3266,13 @@ let TargetPrefix = "aarch64" in {
                 LLVMMatchType<0>,  LLVMMatchType<0>],
                [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
-  class SME2_VG2_Multi_Single_Intrinsic
+  class SVE2_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>],
                 [IntrNoMem]>;
 
-  class SME2_VG4_Multi_Single_Intrinsic
+  class SVE2_VG4_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                             [LLVMMatchType<0>,  LLVMMatchType<0>,
@@ -3280,13 +3280,13 @@ let TargetPrefix = "aarch64" in {
                              LLVMMatchType<0>],
                             [IntrNoMem]>;
 
-  class SME2_VG2_Multi_Multi_Intrinsic
+  class SVE2_VG2_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>],
                 [IntrNoMem]>;
 
-  class SME2_VG4_Multi_Multi_Intrinsic
+  class SVE2_VG4_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                             [LLVMMatchType<0>,  LLVMMatchType<0>,
@@ -3310,42 +3310,42 @@ let TargetPrefix = "aarch64" in {
                  LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>], [IntrNoMem]>;
 
-  class SME2_CVT_VG2_SINGLE_Intrinsic
+  class SVE2_CVT_VG2_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<0>],
                             [IntrNoMem]>;
 
-  class SME2_CVT_VG2_SINGLE_BF16_Intrinsic
+  class SVE2_CVT_VG2_SINGLE_BF16_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
   
-  class SME2_CVT_WIDENING_VG2_Intrinsic
+  class SVE2_CVT_WIDENING_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
   
 
-  class SME2_CVT_VG4_SINGLE_Intrinsic
+  class SVE2_CVT_VG4_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [IntrNoMem]>;
 
-  class SME2_CVT_X2_Intrinsic
+  class SVE2_CVT_X2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<1>],
                             [IntrNoMem]>;
 
-  class SME2_CVT_X4_Intrinsic
+  class SVE2_CVT_X4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>],
                             [IntrNoMem]>;
 
-  class SME2_BFMLS_Intrinsic
+  class SVE2_BFMLS_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
                             [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty],
                             [IntrNoMem]>;
 
-  class SME2_BFMLS_Lane_Intrinsic
+  class SVE2_BFMLS_Lane_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
                             [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<3>>]>;
@@ -3398,13 +3398,13 @@ let TargetPrefix = "aarch64" in {
                 LLVMMatchType<0>,  LLVMMatchType<0>],
                [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
-  class SME2_VG2_Multi_Single_Single_Intrinsic
+  class SVE2_VG2_Multi_Single_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>],
                 [IntrNoMem]>;
 
-  class SME2_VG4_Multi_Single_Single_Intrinsic
+  class SVE2_VG4_Multi_Single_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3422,11 +3422,11 @@ let TargetPrefix = "aarch64" in {
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 
-  class SME2_VG2_Unpk_Intrinsic
+  class SVE2_VG2_Unpk_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
 
-  class SME2_VG4_Unpk_Intrinsic
+  class SVE2_VG4_Unpk_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>, LLVMMatchType<0>],
                 [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
@@ -3467,33 +3467,33 @@ let TargetPrefix = "aarch64" in {
   // Multi-vector rounding shift left intrinsics
   //
 
-  def int_aarch64_sve_srshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_urshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_srshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
-  def int_aarch64_sve_urshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sve_srshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_urshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_srshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sve_urshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-  def int_aarch64_sve_srshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_urshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_srshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_urshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_srshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_urshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_srshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_urshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector saturating rounding shift right intrinsics
 
-  def int_aarch64_sve_sqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
-  def int_aarch64_sve_sqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
-  def int_aarch64_sve_sqrshru_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshru_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshru_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshru_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
-  def int_aarch64_sve_sqrshrun_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshrun_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrun_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrun_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
   //
   // Multi-vector multiply-add/subtract long
@@ -3553,25 +3553,23 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_usmla_za32_lane_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
   def int_aarch64_sme_usmla_za32_lane_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 
-  def int_aarch64_sve_bfmlslb : SME2_BFMLS_Intrinsic;
-  def int_aarch64_sve_bfmlslb_lane : SME2_BFMLS_Lane_Intrinsic;
+  def int_aarch64_sve_bfmlslb : SVE2_BFMLS_Intrinsic;
+  def int_aarch64_sve_bfmlslb_lane : SVE2_BFMLS_Lane_Intrinsic;
 
-  def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic;
-  def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic;
+  def int_aarch64_sve_bfmlslt : SVE2_BFMLS_Intrinsic;
+  def int_aarch64_sve_bfmlslt_lane : SVE2_BFMLS_Lane_Intrinsic;
 
   // Multi-vector zeroing
 
   foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
     def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrWriteMem, IntrInaccessibleMemOnly]>;
   }
-  
   // Multi-vector signed saturating doubling multiply high
+  def int_aarch64_sve_sqdmulh_single_vgx2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_sqdmulh_single_vgx4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-  def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_sqdmulh_single_vgx4 : SME2_VG4_Multi_Single_Intrinsic;
-
-  def int_aarch64_sve_sqdmulh_vgx2 : SME2_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_sqdmulh_vgx4 : SME2_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_sqdmulh_vgx2 : SVE2_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_sqdmulh_vgx4 : SVE2_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector floating-point round to integral value
 
@@ -3586,11 +3584,11 @@ let TargetPrefix = "aarch64" in {
 
   foreach ty = ["f", "s", "u"] in {
     foreach instr = ["max", "min"] in {
-      def int_aarch64_sve_ # ty # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-      def int_aarch64_sve_ # ty # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-      def int_aarch64_sve_ # ty # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
-      def int_aarch64_sve_ # ty # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
     }
   }
 
@@ -3599,11 +3597,11 @@ let TargetPrefix = "aarch64" in {
   //
 
   foreach instr = ["fmaxnm", "fminnm"] in {
-    def int_aarch64_sve_ # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-    def int_aarch64_sve_ # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+    def int_aarch64_sve_ # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+    def int_aarch64_sve_ # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-    def int_aarch64_sve_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
-    def int_aarch64_sve_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+    def int_aarch64_sve_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+    def int_aarch64_sve_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
   }
 
   //
@@ -3611,8 +3609,8 @@ let TargetPrefix = "aarch64" in {
   //
 
   foreach instr = ["famax", "famin"] in {
-    def int_aarch64_sme_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
-    def int_aarch64_sme_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+    def int_aarch64_sme_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+    def int_aarch64_sme_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
   }
 
   //
@@ -3634,48 +3632,47 @@ let TargetPrefix = "aarch64" in {
   //
   //Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
   //
-  
-  def int_aarch64_sve_fcvtl_widen_x2  : SME2_CVT_WIDENING_VG2_Intrinsic;
+  def int_aarch64_sve_fcvtl_widen_x2  : SVE2_CVT_WIDENING_VG2_Intrinsic;
 
   //
   // Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
   //
-  def int_aarch64_sve_fcvtn_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_bfcvtn_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
+  def int_aarch64_sve_fcvtn_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_bfcvtn_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
 
   //
   // Multi-vector convert to/from floating-point.
   //
-  def int_aarch64_sve_fcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
-  def int_aarch64_sve_fcvtzs_x2 : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_fcvtzu_x2 : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_scvtf_x2  : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_ucvtf_x2  : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_fcvtzs_x4 : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_scvtf_x4  : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_ucvtf_x4  : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
+  def int_aarch64_sve_fcvt_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_bfcvt_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
+  def int_aarch64_sve_fcvtzs_x2 : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_fcvtzu_x2 : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_scvtf_x2  : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_ucvtf_x2  : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_fcvtzs_x4 : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_fcvtzu_x4 : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_scvtf_x4  : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_ucvtf_x4  : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_fcvt_widen_x2 : SVE2_CVT_WIDENING_VG2_Intrinsic;
   //
   // Multi-vector saturating extract narrow
   //
-  def int_aarch64_sve_sqcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtu_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvt_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvt_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtu_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvt_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvt_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtu_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvt_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvt_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtu_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
 
   //
   // Multi-vector saturating extract narrow and interleave
   //
-  def int_aarch64_sve_sqcvtn_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvtn_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtun_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtn_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvtn_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtun_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtn_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvtn_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtun_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtn_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvtn_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtun_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
 
   //
   // Multi-Single add/sub
@@ -3694,15 +3691,15 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector clamps
-  def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_sclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_uclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_fclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
 
-  def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_sclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_uclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_fclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
 
   //
   // Multi-vector add/sub and accumulate into ZA
@@ -3739,8 +3736,8 @@ let TargetPrefix = "aarch64" in {
   //
   // Multi-Single Vector add
   //
-  def int_aarch64_sve_add_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_add_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sve_add_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_add_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
   // 2-way and 4-way multi-vector signed/unsigned integer dot-product
   foreach ty = ["s", "u"] in {
@@ -3798,10 +3795,10 @@ let TargetPrefix = "aarch64" in {
   //
   // Signed/unsigned multi-vector unpacks
   //
-  def int_aarch64_sve_sunpk_x2 : SME2_VG2_Unpk_Intrinsic;
-  def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic;
-  def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic;
-  def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic;
+  def int_aarch64_sve_sunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+  def int_aarch64_sve_uunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+  def int_aarch64_sve_sunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
+  def int_aarch64_sve_uunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
 
   // 2-way and 4-way vector selects
   def int_aarch64_sve_sel_x2  : SVE2_VG2_Sel_Intrinsic;
@@ -4133,4 +4130,4 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2  : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
-}
+}
\ No newline at end of file

>From 77b2736152223b569cfb822cbd167e211d41afc3 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 11 Apr 2025 15:20:00 +0000
Subject: [PATCH 4/4] Fix incorrect properties for some ZA intrinsics

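As a rough illustration of what the corrected attribute sets buy the optimizer
(the IR below is a hand-written sketch for this note, not taken from the
patch's tests): an intrinsic carrying IntrInaccessibleMemOnly together with
IntrWriteMem is known not to read or write any user-visible memory, so
ordinary loads and stores can be optimized across the call, while the ZA
update itself is still modelled as a write to inaccessible state that cannot
be dropped or reordered against other ZA intrinsics. The .nxv4i32 overload
suffix and the "aarch64_inout_za" attribute are one plausible instantiation,
chosen only for the example.

  ; Illustrative only: with inaccessiblememonly(write) on the intrinsic,
  ; the two loads of %p can be folded into one, because the call is known
  ; not to touch ordinary memory; the ZA update is still a side effect.
  define i32 @za_add_then_load(ptr %p, i32 %slice,
                               <vscale x 4 x i32> %zn0,
                               <vscale x 4 x i32> %zn1,
                               <vscale x 4 x i32> %zm) "aarch64_inout_za" {
    %a = load i32, ptr %p
    call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(
                i32 %slice, <vscale x 4 x i32> %zn0,
                <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm)
    %b = load i32, ptr %p
    %s = add i32 %a, %b
    ret i32 %s
  }

  declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(
                i32, <vscale x 4 x i32>, <vscale x 4 x i32>,
                <vscale x 4 x i32>)
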
---
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 66 ++++++++++++++++-------
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 222a20d9e4044..dff343b85ef51 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3257,14 +3257,14 @@ let TargetPrefix = "aarch64" in {
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               [IntrWriteMem, IntrInaccessibleMemOnly]>;
+               [IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               [IntrWriteMem, IntrInaccessibleMemOnly]>;
+               [IntrInaccessibleMemOnly]>;
 
   class SVE2_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3677,18 +3677,48 @@ let TargetPrefix = "aarch64" in {
   //
   // Multi-Single add/sub
   //
-  def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
-  def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+  class SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>,
+          LLVMMatchType<0>],
+          [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+  class SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+          LLVMMatchType<0>],
+        [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+  def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
 
   //
   // Multi-Multi add/sub
   //
-  def int_aarch64_sme_add_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sme_add_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
-  def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+  class SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>],
+          [IntrInaccessibleMemOnly, IntrWriteMem]>;
+          
+  class SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>],
+          [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+  def int_aarch64_sme_add_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_add_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector clamps
   def int_aarch64_sve_sclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
@@ -3984,12 +4014,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_fp8_fmlalltt      : SVE2_FP8_FMLA_FDOT;
   def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane;
 
-  class SME2_FP8_CVT_X2_Single_Intrinsic
+  class SVE2_FP8_CVT_X2_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [llvm_nxv16i8_ty],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
 
-  class SME2_FP8_CVT_Single_X4_Intrinsic
+  class SVE2_FP8_CVT_Single_X4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
@@ -4053,14 +4083,14 @@ let TargetPrefix = "aarch64" in {
   //
   // CVT from FP8 to half-precision/BFloat16 multi-vector
   //
-  def int_aarch64_sve_fp8_cvt1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
-  def int_aarch64_sve_fp8_cvt2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvt1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvt2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
 
   //
   // CVT from FP8 to deinterleaved half-precision/BFloat16 multi-vector
   //
-  def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
-  def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvtl1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvtl2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
 
   //
   // CVT to FP8 from half-precision/BFloat16/single-precision multi-vector
@@ -4070,8 +4100,8 @@ let TargetPrefix = "aarch64" in {
                             [llvm_anyvector_ty, LLVMMatchType<0>],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
 
-  def int_aarch64_sve_fp8_cvt_x4  : SME2_FP8_CVT_Single_X4_Intrinsic;
-  def int_aarch64_sve_fp8_cvtn_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
+  def int_aarch64_sve_fp8_cvt_x4  : SVE2_FP8_CVT_Single_X4_Intrinsic;
+  def int_aarch64_sve_fp8_cvtn_x4 : SVE2_FP8_CVT_Single_X4_Intrinsic;
 
   // FP8 outer product
   def int_aarch64_sme_fp8_fmopa_za16 : SME_FP8_OuterProduct_Intrinsic;


