[llvm] [AArch64] Model ZA array using inaccessible memory (PR #132058)

via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 14 05:59:31 PDT 2025


https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/132058

From d27701bcc0d70d47fa9aaf6f51ecdb6744838abf Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 19 Mar 2025 15:35:59 +0000
Subject: [PATCH 1/5] [AArch64] Model ZA array using inaccessible memory

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  80 ++--
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  39 +-
 .../AArch64/sme-intrinsics-mova-extract.ll    | 259 ++++++++-----
 .../AArch64/sme2-intrinsics-extract-mova.ll   | 362 ++++++++++++------
 4 files changed, 483 insertions(+), 257 deletions(-)
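
For context (not part of the patch itself): the intent of the change is that ZA-accessing intrinsics stop being declared as IntrNoMem + IntrHasSideEffects and are instead modelled as reads/writes of inaccessible memory, so they cannot be dropped or freely reordered, yet still do not alias ordinary loads and stores. A rough sketch, in LLVM IR, of what a ZA-reading intrinsic declaration looks like under this model (the exact attribute set is illustrative, not copied from compiler output):

  ; Reads the ZA tile; ordinary memory operations cannot alias it.
  declare <vscale x 4 x i32>
    @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>,
                                         i32 immarg, i32) #0

  ; ZA-writing intrinsics (e.g. llvm.aarch64.sme.zero) would instead carry
  ; memory(inaccessiblemem: write).
  attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }

The test changes below follow from this: reads from ZA are no longer CSE'd or dead-code-eliminated as if they were pure, so the tests are rewritten to keep every result live (via dummy external calls or by combining the results) rather than returning only one value.
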

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 77ea0bcaa4b5f..7ced13c0c1962 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2940,7 +2940,7 @@ def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic;
 let TargetPrefix = "aarch64" in {
   class SME_Load_Store_Intrinsic<LLVMType pred_ty>
     : DefaultAttrsIntrinsic<[],
-        [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+        [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>]>;
 
   // Loads
   def int_aarch64_sme_ld1b_horiz : SME_Load_Store_Intrinsic<llvm_nxv16i1_ty>;
@@ -2968,18 +2968,18 @@ let TargetPrefix = "aarch64" in {
 
   // Spill + fill
   class SME_LDR_STR_ZA_Intrinsic
-    : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>;
+    : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly]>;
   def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic;
   def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic;
 
   class SME_TileToVector_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
           [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-           llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+           llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
   class SME_VectorToTile_Intrinsic
       : DefaultAttrsIntrinsic<[],
           [llvm_i32_ty, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-           llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+           llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_read_horiz  : SME_TileToVector_Intrinsic;
   def int_aarch64_sme_read_vert   : SME_TileToVector_Intrinsic;
@@ -2994,13 +2994,13 @@ let TargetPrefix = "aarch64" in {
   class SME_MOVAZ_TileToVector_X2_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
           [llvm_i32_ty, llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   class SME_MOVAZ_TileToVector_X4_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
            LLVMMatchType<0>,LLVMMatchType<0>],
           [llvm_i32_ty, llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_readz_horiz_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic;
   def int_aarch64_sme_readz_vert_x2  : SME_MOVAZ_TileToVector_X2_Intrinsic;
@@ -3011,7 +3011,7 @@ let TargetPrefix = "aarch64" in {
   class SME_MOVAZ_TileToVector_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
           [llvm_i32_ty, llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_readz_horiz : SME_MOVAZ_TileToVector_Intrinsic;
   def int_aarch64_sme_readz_vert  : SME_MOVAZ_TileToVector_Intrinsic;
@@ -3022,12 +3022,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_readz_x2
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
           [llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects]>;
+          [IntrInaccessibleMemOnly]>;
 
   def int_aarch64_sme_readz_x4
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
           [llvm_i32_ty],
-          [IntrNoMem, IntrHasSideEffects]>;
+          [IntrInaccessibleMemOnly]>;
 
   def int_aarch64_sme_write_lane_zt
        :  DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty],
@@ -3038,7 +3038,7 @@ let TargetPrefix = "aarch64" in {
             [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrWriteMem]>;
 
 
-  def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+  def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
   def int_aarch64_sme_in_streaming_mode : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_arm_in_streaming_mode">;
 
   class SME_OuterProduct_Intrinsic
@@ -3047,7 +3047,7 @@ let TargetPrefix = "aarch64" in {
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
            LLVMMatchType<0>,
-           llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+           llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic;
   def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic;
@@ -3112,7 +3112,7 @@ let TargetPrefix = "aarch64" in {
           [llvm_i32_ty,
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-           llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+           llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   def int_aarch64_sme_addha : SME_AddVectorToTile_Intrinsic;
   def int_aarch64_sme_addva : SME_AddVectorToTile_Intrinsic;
@@ -3232,56 +3232,56 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
-                []>;
+                [IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_ArrayVector_Single_Index_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                 llvm_anyvector_ty,
                 LLVMMatchType<0>, llvm_i32_ty],
-                [ImmArg<ArgIndex<3>>]>;
+                [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
 
   class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, llvm_i32_ty],
-                [ImmArg<ArgIndex<4>>]>;
+                [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
 
   class SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic
     : DefaultAttrsIntrinsic<[],
                 [llvm_i32_ty,
                  llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, llvm_i32_ty],
-                [ImmArg<ArgIndex<6>>]>;
+                [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
 
   class SME2_VG2_Multi_Imm_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
@@ -3300,14 +3300,14 @@ let TargetPrefix = "aarch64" in {
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3396,50 +3396,50 @@ let TargetPrefix = "aarch64" in {
   class SME2_ZA_ArrayVector_Read_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_ArrayVector_Read_VG4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                 [llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_TileVector_Read_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [llvm_i32_ty, llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_TileVector_Read_VG4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                 [llvm_i32_ty, llvm_i32_ty],
-                []>;
+                [IntrReadMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_ArrayVector_Write_VG2_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_ArrayVector_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               []>;
+               [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   class SME2_Matrix_TileVector_Write_VG2_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty, llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               [ImmArg<ArgIndex<0>>]>;
+               [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   class SME2_Matrix_TileVector_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty, llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               [ImmArg<ArgIndex<0>>]>;
+               [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
   class SME2_VG2_Multi_Single_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3605,7 +3605,7 @@ let TargetPrefix = "aarch64" in {
   // Multi-vector zeroing
 
   foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
-    def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrNoMem, IntrHasSideEffects]>;
+    def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrWriteMem, IntrInaccessibleMemOnly]>;
   }
   
   // Multi-vector signed saturating doubling multiply high
@@ -4045,57 +4045,57 @@ let TargetPrefix = "aarch64" in {
           [llvm_i32_ty,
           llvm_nxv16i1_ty, llvm_nxv16i1_ty,
           llvm_nxv16i8_ty, llvm_nxv16i8_ty],
-          [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+          [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_LANE_VGx1_Intrinsic
    : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                llvm_nxv16i8_ty,
                                llvm_nxv16i8_ty,
                                llvm_i32_ty],
-                          [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<3>>]>;
+                          [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
 
   class SME_FP8_ZA_LANE_VGx2_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty,
                                 llvm_i32_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;
+                            [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
 
   class SME_FP8_ZA_LANE_VGx4_Intrinsic
    : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty,
                                 llvm_i32_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
+                            [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
   class SME_FP8_ZA_SINGLE_VGx1_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_SINGLE_VGx2_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_SINGLE_VGx4_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                 llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                 llvm_nxv16i8_ty],
-                              [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                              [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_MULTI_VGx2_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
 
   class SME_FP8_ZA_MULTI_VGx4_Intrinsic
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
                                  llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
-                            [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+                            [IntrInaccessibleMemOnly]>;
   //
   // CVT from FP8 to half-precision/BFloat16 multi-vector
   //
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index c008cda21cf05..b611dddb0b045 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -102,6 +102,8 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
   // Translated to the actual instructions in AArch64ISelLowering.cpp
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_quarter_tile_outer_product_pseudo<RegisterOperand zn_ty, RegisterOperand zm_ty, SMEMatrixTypeEnum za_flag>
@@ -119,6 +121,8 @@ class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, Regis
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -127,6 +131,8 @@ class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, Regist
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -135,6 +141,8 @@ class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, Regist
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, imm_ty:$i), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -142,6 +150,7 @@ class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -149,6 +158,7 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
       Pseudo<(outs), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum za_flag>
@@ -156,6 +166,7 @@ class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum
       Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, index_ty:$imm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag>
@@ -163,6 +174,8 @@ class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, R
       Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -171,6 +184,8 @@ class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOpe
       Pseudo<(outs multi_vector_ty:$Zd), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3), []> {
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -757,6 +772,8 @@ class sme_add_vector_to_tile_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
   // Translated to the actual instructions in AArch64ISelLowering.cpp
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }
 
 multiclass sme_add_vector_to_tile_u32<bit V, string mnemonic, SDPatternOperator op> {
@@ -1215,6 +1232,7 @@ class sme_mova_insert_pseudo<SMEMatrixTypeEnum za_flag>
   // Translated to the actual instructions in AArch64ISelLowering.cpp
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
+  let mayStore = 1;
 }
 
 multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
@@ -1409,6 +1427,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
                                    is_col, sme_elm_idx0_15, mnemonic> {
     bits<4> imm;
     let Inst{8-5} = imm;
+    let mayLoad = 1;
   }
   def _H : sme_tile_to_vector_inst<0b0, 0b01, ZPR16, !if(is_col, TileVectorOpV16,
                                                                  TileVectorOpH16),
@@ -1417,6 +1436,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
     bits<3> imm;
     let Inst{8}   = ZAn;
     let Inst{7-5} = imm;
+    let mayLoad = 1;
   }
   def _S : sme_tile_to_vector_inst<0b0, 0b10, ZPR32, !if(is_col, TileVectorOpV32,
                                                                  TileVectorOpH32),
@@ -1425,6 +1445,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
     bits<2> imm;
     let Inst{8-7} = ZAn;
     let Inst{6-5} = imm;
+    let mayLoad = 1;
   }
   def _D : sme_tile_to_vector_inst<0b0, 0b11, ZPR64, !if(is_col, TileVectorOpV64,
                                                                  TileVectorOpH64),
@@ -1433,12 +1454,14 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
     bits<1> imm;
     let Inst{8-6} = ZAn;
     let Inst{5}   = imm;
+    let mayLoad = 1;
   }
   def _Q : sme_tile_to_vector_inst<0b1, 0b11, ZPR128, !if(is_col, TileVectorOpV128,
                                                                   TileVectorOpH128),
                                    is_col, sme_elm_idx0_0, mnemonic> {
     bits<4> ZAn;
     let Inst{8-5} = ZAn;
+    let mayLoad = 1;
   }
 
   defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _B), ZPR8,
@@ -1909,7 +1932,9 @@ multiclass sme2_multivec_accum_add_sub_vg2<string mnemonic, bits<4> op,
   def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
   (!cast<Instruction>(NAME) matrix_ty:$ZAdn,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
 
-  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+    let mayLoad = 1;
+  }
   def : SME2_ZA_VG1x2_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
 }
 
@@ -1932,7 +1957,9 @@ multiclass sme2_multivec_accum_add_sub_vg4<string mnemonic, bits<4> op,
   def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
   (!cast<Instruction>(NAME) matrix_ty:$ZAdn,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
 
-  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+  def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+    let mayLoad = 1;
+  }
   def : SME2_ZA_VG1x4_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
 }
 
@@ -4441,6 +4468,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
                                                  uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
     bits<3> imm;
     let Inst{7-5} = imm;
+    let mayLoad = 1;
   }
 
   def _H : sme2_mova_tile_to_vec_vg2_multi_base<0b01, v, opc, ZZ_h_mul_r,
@@ -4451,6 +4479,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
     bits<2> imm;
     let Inst{7}   = ZAn;
     let Inst{6-5} = imm;
+    let mayLoad = 1;
   }
 
   def _S : sme2_mova_tile_to_vec_vg2_multi_base<0b10, v, opc, ZZ_s_mul_r,
@@ -4461,6 +4490,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
     bits<1> imm;
     let Inst{7-6} = ZAn;
     let Inst{5}   = imm;
+    let mayLoad = 1;
   }
 
   def _D : sme2_mova_tile_to_vec_vg2_multi_base<0b11, v, opc, ZZ_d_mul_r,
@@ -4469,6 +4499,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
                                                 uimm0s2range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
     bits<3> ZAn;
     let Inst{7-5} = ZAn;
+    let mayLoad = 1;
   }
 
   if !eq(mnemonic, "mova") then {
@@ -4583,6 +4614,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
                                                 uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
     bits<2> imm;
     let Inst{6-5} = imm;
+    let mayLoad = 1;
   }
 
   def _H : sme2_mova_tile_to_vec_vg4_multi_base<0b01, v, {opc,0,?,?},
@@ -4594,6 +4626,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
     bits<1> imm;
     let Inst{6}   = ZAn;
     let Inst{5}   = imm;
+    let mayLoad = 1;
   }
 
   def _S : sme2_mova_tile_to_vec_vg4_multi_base<0b10, v, {opc,0,?,?},
@@ -4603,6 +4636,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
                                                  uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
     bits<2> ZAn;
     let Inst{6-5} = ZAn;
+    let mayLoad = 1;
   }
 
   def _D : sme2_mova_tile_to_vec_vg4_multi_base<0b11, v, {opc,?,?,?},
@@ -4612,6 +4646,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
                                                 uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
     bits<3> ZAn;
     let Inst{7-5} = ZAn;
+    let mayLoad = 1;
   }
 
   if !eq(mnemonic, "mova") then {
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 1960987cce4ce..b187fbe214c1a 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -1,27 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
 
-define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_b:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.b, p0/m, za0h.b[w12, 0]
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 2]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.b, p0/m, za0h.b[w12, 0]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.b, p0/m, za0h.b[w12, 2]
 ; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 4]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 6]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 8]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 10]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 12]
-; CHECK-NEXT:    mov z0.b, p0/m, za0h.b[w12, 14]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.b, p0/m, za0h.b[w12, 6]
+; CHECK-NEXT:    mov z4.b, p0/m, za0h.b[w12, 8]
+; CHECK-NEXT:    mov z5.b, p0/m, za0h.b[w12, 10]
+; CHECK-NEXT:    mov z6.b, p0/m, za0h.b[w12, 12]
+; CHECK-NEXT:    mov z7.b, p0/m, za0h.b[w12, 14]
+; CHECK-NEXT:    b dummy_use_8_nxv16i8
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -37,30 +36,33 @@ define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x
   %z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.12)
   %tileslice.14 = add i32 %tileslice, 14
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
-  ret <vscale x 16 x i8> %z0
+
+  ; Force retention of z0..z7
+  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  ret void
 }
 
-define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_b:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.b, p0/m, za0v.b[w12, 1]
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 3]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.b, p0/m, za0v.b[w12, 1]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.b, p0/m, za0v.b[w12, 3]
 ; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 5]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 7]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 9]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 11]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 13]
-; CHECK-NEXT:    mov z0.b, p0/m, za0v.b[w12, 15]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.b, p0/m, za0v.b[w12, 7]
+; CHECK-NEXT:    mov z4.b, p0/m, za0v.b[w12, 9]
+; CHECK-NEXT:    mov z5.b, p0/m, za0v.b[w12, 11]
+; CHECK-NEXT:    mov z6.b, p0/m, za0v.b[w12, 13]
+; CHECK-NEXT:    mov z7.b, p0/m, za0v.b[w12, 15]
+; CHECK-NEXT:    b dummy_use_8_nxv16i8
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -77,22 +79,24 @@ define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x
   %z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.13)
   %tileslice.15 = add i32 %tileslice, 15
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
-  ret <vscale x 16 x i8> %z0
+
+  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  ret void
 }
 
-define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_h:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 2]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 2]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 6]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za0h.h[w12, 6]
+; CHECK-NEXT:    b dummy_use_4_nxv8i16
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -100,22 +104,23 @@ define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
   %z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.4)
   %tileslice.6 = add i32 %tileslice, 6
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
-  ret <vscale x 8 x i16> %z0
+
+  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  ret void
 }
 
-define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_h:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za1v.h[w12, 1]
-; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 3]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za1v.h[w12, 1]
+; CHECK-NEXT:    mov z1.h, p0/m, za1v.h[w12, 3]
 ; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 5]
-; CHECK-NEXT:    mov z0.h, p0/m, za1v.h[w12, 7]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za1v.h[w12, 7]
+; CHECK-NEXT:    b dummy_use_4_nxv8i16
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -124,30 +129,31 @@ define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
   %z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.5)
   %tileslice.7 = add i32 %tileslice, 7
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
-  ret <vscale x 8 x i16> %z0
+
+  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  ret void
 }
 
-define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT:    mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT:    mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT:    b dummy_use_8_nxv8f16
   %z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -163,30 +169,32 @@ define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i
   %z6 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
-  ret <vscale x 8 x half> %z0
+
+  tail call void @dummy_use_8_nxv8f16(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
+                                 <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
+  ret void
 }
 
-define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
+define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
 ; CHECK-LABEL: extract_bf16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT:    mov z4.d, z7.d
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    mov z6.d, z7.d
+; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 1]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT:    mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT:    mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT:    b dummy_use_8_nxv8bf16
   %z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -202,53 +210,57 @@ define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x
   %z6 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
-  ret <vscale x 8 x bfloat> %z0
+
+  tail call void @dummy_use_8_nxv8bf16(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
+                                  <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
+  ret void
 }
 
-define <vscale x 4 x i32> @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 2]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 2]
+; CHECK-NEXT:    b dummy_use_2_nxv4i32
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
-  ret <vscale x 4 x i32> %z0
+
+  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  ret void
 }
 
-define <vscale x 4 x i32> @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.s, p0/m, za3v.s[w12, 1]
-; CHECK-NEXT:    mov z0.s, p0/m, za3v.s[w12, 3]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z0.s, p0/m, za3v.s[w12, 1]
+; CHECK-NEXT:    mov z1.s, p0/m, za3v.s[w12, 3]
+; CHECK-NEXT:    b dummy_use_2_nxv4i32
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
-  ret <vscale x 4 x i32> %z0
+
+  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  ret void
 }
 
-define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT:    mov z2.s, p0/m, za0h.s[w12, 1]
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 1]
 ; CHECK-NEXT:    mov z2.s, p0/m, za0v.s[w12, 2]
-; CHECK-NEXT:    mov z0.s, p0/m, za0v.s[w12, 3]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z3.s, p0/m, za0v.s[w12, 3]
+; CHECK-NEXT:    b dummy_use_4_nxv4f32
   %z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -256,7 +268,9 @@ define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x
   %z2 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
   %tileslice.3 = add i32 %tileslice, 3
   %z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
-  ret <vscale x 4 x float> %z0
+
+  tail call void @dummy_use_4_nxv4f32(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
+  ret void
 }
 
 define <vscale x 2 x i64> @extract_row_d(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
@@ -280,19 +294,20 @@ define <vscale x 2 x i64> @extract_col_d(<vscale x 2 x i64> %zd, <vscale x 2 x i
   ret <vscale x 2 x i64> %z0
 }
 
-define <vscale x 2 x double> @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
+define void @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov w12, w0
-; CHECK-NEXT:    mov z1.d, p0/m, za0h.d[w12, 0]
-; CHECK-NEXT:    mov z0.d, p0/m, za0v.d[w12, 1]
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    mov z0.d, p0/m, za0h.d[w12, 0]
+; CHECK-NEXT:    mov z1.d, p0/m, za0v.d[w12, 1]
+; CHECK-NEXT:    b dummy_use_2_nxv2f64
   %z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
-  ret <vscale x 2 x double> %z0
+
+  tail call void @dummy_use_2_nxv2f64(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
+  ret void
 }
 
 define <vscale x 16 x i8> @extract_row_q_v16i18(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg) {
@@ -506,3 +521,35 @@ declare <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i3
 declare <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
 declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
+
+; ------------------------------------------------------------------------------
+; Dummy external functions to force code retention.
+; The compiler does not see their implementations, so it must keep the calls.
+; ------------------------------------------------------------------------------
+
+declare void @dummy_use_8_nxv16i8(
+  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>
+)
+
+declare void @dummy_use_4_nxv8i16(
+  <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>
+)
+
+declare void @dummy_use_8_nxv8f16(
+  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>
+)
+
+declare void @dummy_use_8_nxv8bf16(
+  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>
+)
+
+declare void @dummy_use_2_nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare void @dummy_use_4_nxv4f32(
+  <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>
+)
+
+declare void @dummy_use_2_nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
index ca5399a0503e9..c01c96cc56975 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-b16b16 -verify-machineinstrs -force-streaming < %s | FileCheck %s
 
 ;
 ; Move Multi-Vector From Tile (Read) x2
@@ -7,82 +7,106 @@
 
 ; Horizontal
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg2_b(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 0:1]
-; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT:    mov { z2.b, z3.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT:    add z0.b, z0.b, z2.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
   %slice.14 = add i32 %slice, 14
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg2_h(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg2_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg2_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_horiz_vg2_s(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum     = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_horiz_vg2_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg2_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    fadd z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum     = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
 define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
@@ -107,82 +131,106 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i
 
 ; Vertical
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg2_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 0:1]
-; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT:    mov { z2.b, z3.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT:    add z0.b, z0.b, z2.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
   %slice.14 = add i32 %slice, 14
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg2_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg2_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg2_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
   %slice.6 = add i32 %slice, 6
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vert_vg2_s(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum     = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vert_vg2_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg2_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    fadd z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
   %slice.2 = add i32 %slice, 2
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2    = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum     = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
 define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
@@ -211,56 +259,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i3
 
 ; Horizontal
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg4_b(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 0:3]
-; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT:    mov { z4.b - z7.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT:    add z0.b, z0.b, z4.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
   %slice.12 = add i32 %slice, 12
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg4_h(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    add z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg4_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg4_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_horiz_vg4_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
 define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
@@ -305,56 +369,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 
 ; Vertical
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg4_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 0:3]
-; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT:    mov { z4.b - z7.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT:    add z0.b, z0.b, z4.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
   %slice.12 = add i32 %slice, 12
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg4_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    add z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum     = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg4_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    fadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum     = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg4_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vert_vg4_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
   %slice.4 = add i32 %slice, 4
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2    = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum     = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
 define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
@@ -399,214 +479,278 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 
 ; Move Multi-Vector From ZA (Read) x2
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x2_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.b, z0.b, z2.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice.7)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x2_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice.7)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum      = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x2_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    fadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice.7)
-  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum      = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x2_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice.7)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum      = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x2_s(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice.7)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum      = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x2_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    fadd z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice.7)
-  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum      = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
-define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x2_d(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
-  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+  %val1     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+  %sum      = add <vscale x 2 x i64> %val1, %val2
+  ret <vscale x 2 x i64> %sum
 }
 
-define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x2_f64(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x2_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT:    fadd z0.d, z0.d, z2.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
-  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
+  %val1     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+  %sum      = fadd <vscale x 2 x double> %val1, %val2
+  ret <vscale x 2 x double> %sum
 }
 
 ; Move Multi-Vector From ZA (Read) x4
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x4_b(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.b, z0.b, z4.b
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice.7)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+  %val1     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+  %val2     = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+  %sum      = add <vscale x 16 x i8> %val1, %val2
+  ret <vscale x 16 x i8> %sum
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x4_h(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice.7)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+  %val1     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+  %sum      = add <vscale x 8 x i16> %val1, %val2
+  ret <vscale x 8 x i16> %sum
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x4_f16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    fadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice.7)
-  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+  %val1     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+  %sum      = fadd <vscale x 8 x half> %val1, %val2
+  ret <vscale x 8 x half> %sum
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x4_bf16(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    bfadd z0.h, z0.h, z4.h
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice.7)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+  %val1     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+  %val2     = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+  %sum      = fadd <vscale x 8 x bfloat> %val1, %val2
+  ret <vscale x 8 x bfloat> %sum
 }
 
-define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x4_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x4_s(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.s, z0.s, z4.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice.7)
-  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+  %val1     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+  %sum      = add <vscale x 4 x i32> %val1, %val2
+  ret <vscale x 4 x i32> %sum
 }
 
-define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x4_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x4_f32(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    fadd z0.s, z0.s, z4.s
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice.7)
-  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2
+  %val1     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+  %val2     = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+  %sum      = fadd <vscale x 4 x float> %val1, %val2
+  ret <vscale x 4 x float> %sum
 }
 
-define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x4_d(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    add z0.d, z0.d, z4.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
-  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+  %val1     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+  %sum      = add <vscale x 2 x i64> %val1, %val2
+  ret <vscale x 2 x i64> %sum
 }
 
-define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x4_f64(i32 %slice) {
 ; CHECK-LABEL: za_read_vg1x4_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT:    fadd z0.d, z0.d, z4.d
 ; CHECK-NEXT:    ret
   %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
   %slice.7 = add i32 %slice, 7
   %res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
-  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
+  %val1     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+  %val2     = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+  %sum      = fadd <vscale x 2 x double> %val1, %val2
+  ret <vscale x 2 x double> %sum
 }
 
 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)

>From cec3fbe4371d03ddc303d8cae6800a27d0265dd6 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Tue, 8 Apr 2025 16:57:03 +0000
Subject: [PATCH 2/5] Simplify dummy functions

---
 .../AArch64/sme-intrinsics-mova-extract.ll    | 79 +++++++------------
 1 file changed, 27 insertions(+), 52 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index b187fbe214c1a..0fd7fea93c30e 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -20,7 +20,7 @@ define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
 ; CHECK-NEXT:    mov z5.b, p0/m, za0h.b[w12, 10]
 ; CHECK-NEXT:    mov z6.b, p0/m, za0h.b[w12, 12]
 ; CHECK-NEXT:    mov z7.b, p0/m, za0h.b[w12, 14]
-; CHECK-NEXT:    b dummy_use_8_nxv16i8
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -38,8 +38,8 @@ define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
 
   ; Force retention of z0..z7
-  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
-                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                      <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
   ret void
 }
 
@@ -62,7 +62,7 @@ define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
 ; CHECK-NEXT:    mov z5.b, p0/m, za0v.b[w12, 11]
 ; CHECK-NEXT:    mov z6.b, p0/m, za0v.b[w12, 13]
 ; CHECK-NEXT:    mov z7.b, p0/m, za0v.b[w12, 15]
-; CHECK-NEXT:    b dummy_use_8_nxv16i8
+; CHECK-NEXT:    b use
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -80,8 +80,8 @@ define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
   %tileslice.15 = add i32 %tileslice, 15
   %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
 
-  tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
-                                 <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+  tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+                      <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
   ret void
 }
 
@@ -96,7 +96,7 @@ define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 2]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
 ; CHECK-NEXT:    mov z3.h, p0/m, za0h.h[w12, 6]
-; CHECK-NEXT:    b dummy_use_4_nxv8i16
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -105,7 +105,7 @@ define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
   %tileslice.6 = add i32 %tileslice, 6
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
 
-  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
   ret void
 }
 
@@ -120,7 +120,7 @@ define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov z1.h, p0/m, za1v.h[w12, 3]
 ; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 5]
 ; CHECK-NEXT:    mov z3.h, p0/m, za1v.h[w12, 7]
-; CHECK-NEXT:    b dummy_use_4_nxv8i16
+; CHECK-NEXT:    b use
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
@@ -130,7 +130,7 @@ define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
   %tileslice.7 = add i32 %tileslice, 7
   %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
 
-  tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16>  %z0, <vscale x 8 x i16>  %z1, <vscale x 8 x i16>  %z2, <vscale x 8 x i16>  %z3)
+  tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
   ret void
 }
 
@@ -153,7 +153,7 @@ define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %ti
 ; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
 ; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
 ; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    b dummy_use_8_nxv8f16
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -170,8 +170,8 @@ define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %ti
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
 
-  tail call void @dummy_use_8_nxv8f16(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
-                                 <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
+  tail call void @use(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
+                      <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
   ret void
 }
 
@@ -194,7 +194,7 @@ define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32
 ; CHECK-NEXT:    mov z5.h, p0/m, za0h.h[w12, 5]
 ; CHECK-NEXT:    mov z6.h, p0/m, za0v.h[w12, 6]
 ; CHECK-NEXT:    mov z7.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT:    b dummy_use_8_nxv8bf16
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -211,8 +211,8 @@ define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32
   %tileslice.7 = add i32 %tileslice, 7
   %z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
 
-  tail call void @dummy_use_8_nxv8bf16(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
-                                  <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
+  tail call void @use(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
+                      <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
   ret void
 }
 
@@ -223,12 +223,12 @@ define void @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 0]
 ; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 2]
-; CHECK-NEXT:    b dummy_use_2_nxv4i32
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.2 = add i32 %tileslice, 2
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
 
-  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
   ret void
 }
 
@@ -239,13 +239,13 @@ define void @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.s, p0/m, za3v.s[w12, 1]
 ; CHECK-NEXT:    mov z1.s, p0/m, za3v.s[w12, 3]
-; CHECK-NEXT:    b dummy_use_2_nxv4i32
+; CHECK-NEXT:    b use
   %tileslice.1 = add i32 %tileslice, 1
   %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
   %tileslice.3 = add i32 %tileslice, 3
   %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
 
-  tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+  tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
   ret void
 }
 
@@ -260,7 +260,7 @@ define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %t
 ; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 1]
 ; CHECK-NEXT:    mov z2.s, p0/m, za0v.s[w12, 2]
 ; CHECK-NEXT:    mov z3.s, p0/m, za0v.s[w12, 3]
-; CHECK-NEXT:    b dummy_use_4_nxv4f32
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -269,7 +269,7 @@ define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %t
   %tileslice.3 = add i32 %tileslice, 3
   %z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
 
-  tail call void @dummy_use_4_nxv4f32(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
+  tail call void @use(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
   ret void
 }
 
@@ -301,12 +301,12 @@ define void @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %
 ; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.d, p0/m, za0h.d[w12, 0]
 ; CHECK-NEXT:    mov z1.d, p0/m, za0v.d[w12, 1]
-; CHECK-NEXT:    b dummy_use_2_nxv2f64
+; CHECK-NEXT:    b use
   %z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
   %tileslice.1 = add i32 %tileslice, 1
   %z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
 
-  tail call void @dummy_use_2_nxv2f64(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
+  tail call void @use(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
   ret void
 }
 
@@ -485,6 +485,7 @@ for.body:
 exit:
   %tmp1 = add <vscale x 4 x i32> %z0, %z1
   %res = add <vscale x 4 x i32> %tmp1, %z2
+  tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
   ret <vscale x 4 x i32> %res
 }
 
@@ -523,33 +524,7 @@ declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i6
 declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
 
 ; ------------------------------------------------------------------------------
-; Dummy external functions to force code retention.
-; The compiler does not see their implementations, so it must keep the calls.
+; Dummy external function to force code retention.
 ; ------------------------------------------------------------------------------
 
-declare void @dummy_use_8_nxv16i8(
-  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
-  <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>
-)
-
-declare void @dummy_use_4_nxv8i16(
-  <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>
-)
-
-declare void @dummy_use_8_nxv8f16(
-  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
-  <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>
-)
-
-declare void @dummy_use_8_nxv8bf16(
-  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
-  <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>
-)
-
-declare void @dummy_use_2_nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-
-declare void @dummy_use_4_nxv4f32(
-  <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>
-)
-
-declare void @dummy_use_2_nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare void @use(...)

>From b98f4c4cbe0a61b10f7401a7b866933af5f45009 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 10 Apr 2025 13:26:18 +0000
Subject: [PATCH 3/5] Rename SME classes not accessing ZA to SVE

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 201 +++++++++++-----------
 1 file changed, 99 insertions(+), 102 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7ced13c0c1962..28fa64a7cda75 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3141,9 +3141,9 @@ let TargetPrefix = "aarch64" in {
                               [IntrNoMem, IntrHasSideEffects]>;
 
   def int_aarch64_sme_za_enable
-      : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+      : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
   def int_aarch64_sme_za_disable
-      : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+      : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
   // Clamp
   //
@@ -3283,13 +3283,13 @@ let TargetPrefix = "aarch64" in {
                  LLVMMatchType<0>, llvm_i32_ty],
                 [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
 
-  class SME2_VG2_Multi_Imm_Intrinsic
+  class SVE2_VG2_Multi_Imm_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                 [llvm_anyvector_ty, LLVMMatchType<0>,
                  llvm_i32_ty],
                 [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
-  class SME2_VG4_Multi_Imm_Intrinsic
+  class SVE2_VG4_Multi_Imm_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
                 [llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3309,13 +3309,13 @@ let TargetPrefix = "aarch64" in {
                 LLVMMatchType<0>,  LLVMMatchType<0>],
                [IntrWriteMem, IntrInaccessibleMemOnly]>;
 
-  class SME2_VG2_Multi_Single_Intrinsic
+  class SVE2_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>],
                 [IntrNoMem]>;
 
-  class SME2_VG4_Multi_Single_Intrinsic
+  class SVE2_VG4_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                             [LLVMMatchType<0>,  LLVMMatchType<0>,
@@ -3323,13 +3323,13 @@ let TargetPrefix = "aarch64" in {
                              LLVMMatchType<0>],
                             [IntrNoMem]>;
 
-  class SME2_VG2_Multi_Multi_Intrinsic
+  class SVE2_VG2_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>],
                 [IntrNoMem]>;
 
-  class SME2_VG4_Multi_Multi_Intrinsic
+  class SVE2_VG4_Multi_Multi_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>,  LLVMMatchType<0>],
                             [LLVMMatchType<0>,  LLVMMatchType<0>,
@@ -3353,42 +3353,42 @@ let TargetPrefix = "aarch64" in {
                  LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>], [IntrNoMem]>;
 
-  class SME2_CVT_VG2_SINGLE_Intrinsic
+  class SVE2_CVT_VG2_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<0>],
                             [IntrNoMem]>;
 
-  class SME2_CVT_VG2_SINGLE_BF16_Intrinsic
+  class SVE2_CVT_VG2_SINGLE_BF16_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
   
-  class SME2_CVT_WIDENING_VG2_Intrinsic
+  class SVE2_CVT_WIDENING_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
   
 
-  class SME2_CVT_VG4_SINGLE_Intrinsic
+  class SVE2_CVT_VG4_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [IntrNoMem]>;
 
-  class SME2_CVT_X2_Intrinsic
+  class SVE2_CVT_X2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<1>],
                             [IntrNoMem]>;
 
-  class SME2_CVT_X4_Intrinsic
+  class SVE2_CVT_X4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>],
                             [IntrNoMem]>;
 
-  class SME2_BFMLS_Intrinsic
+  class SVE2_BFMLS_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
                             [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty],
                             [IntrNoMem]>;
 
-  class SME2_BFMLS_Lane_Intrinsic
+  class SVE2_BFMLS_Lane_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
                             [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<3>>]>;
@@ -3441,13 +3441,13 @@ let TargetPrefix = "aarch64" in {
                 LLVMMatchType<0>,  LLVMMatchType<0>],
                [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
 
-  class SME2_VG2_Multi_Single_Single_Intrinsic
+  class SVE2_VG2_Multi_Single_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>],
                 [IntrNoMem]>;
 
-  class SME2_VG4_Multi_Single_Single_Intrinsic
+  class SVE2_VG4_Multi_Single_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3465,11 +3465,11 @@ let TargetPrefix = "aarch64" in {
                 [LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 
-  class SME2_VG2_Unpk_Intrinsic
+  class SVE2_VG2_Unpk_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
 
-  class SME2_VG4_Unpk_Intrinsic
+  class SVE2_VG4_Unpk_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                              LLVMMatchType<0>, LLVMMatchType<0>],
                 [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
@@ -3510,33 +3510,33 @@ let TargetPrefix = "aarch64" in {
   // Multi-vector rounding shift left intrinsics
   //
 
-  def int_aarch64_sve_srshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_urshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_srshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
-  def int_aarch64_sve_urshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sve_srshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_urshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_srshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sve_urshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-  def int_aarch64_sve_srshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_urshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_srshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_urshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_srshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_urshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_srshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_urshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector saturating rounding shift right intrinsics
 
-  def int_aarch64_sve_sqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
-  def int_aarch64_sve_sqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_uqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_uqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
-  def int_aarch64_sve_sqrshru_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshru_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshru_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshru_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
-  def int_aarch64_sve_sqrshrun_x2 : SME2_VG2_Multi_Imm_Intrinsic;
-  def int_aarch64_sve_sqrshrun_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrun_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+  def int_aarch64_sve_sqrshrun_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
 
   //
   // Multi-vector multiply-add/subtract long
@@ -3596,25 +3596,23 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_usmla_za32_lane_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
   def int_aarch64_sme_usmla_za32_lane_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 
-  def int_aarch64_sve_bfmlslb : SME2_BFMLS_Intrinsic;
-  def int_aarch64_sve_bfmlslb_lane : SME2_BFMLS_Lane_Intrinsic;
+  def int_aarch64_sve_bfmlslb : SVE2_BFMLS_Intrinsic;
+  def int_aarch64_sve_bfmlslb_lane : SVE2_BFMLS_Lane_Intrinsic;
 
-  def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic;
-  def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic;
+  def int_aarch64_sve_bfmlslt : SVE2_BFMLS_Intrinsic;
+  def int_aarch64_sve_bfmlslt_lane : SVE2_BFMLS_Lane_Intrinsic;
 
   // Multi-vector zeroing
 
   foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
     def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrWriteMem, IntrInaccessibleMemOnly]>;
   }
-  
   // Multi-vector signed saturating doubling multiply high
+  def int_aarch64_sve_sqdmulh_single_vgx2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_sqdmulh_single_vgx4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-  def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_sqdmulh_single_vgx4 : SME2_VG4_Multi_Single_Intrinsic;
-
-  def int_aarch64_sve_sqdmulh_vgx2 : SME2_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sve_sqdmulh_vgx4 : SME2_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_sqdmulh_vgx2 : SVE2_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sve_sqdmulh_vgx4 : SVE2_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector floating-point round to integral value
 
@@ -3629,11 +3627,11 @@ let TargetPrefix = "aarch64" in {
 
   foreach ty = ["f", "s", "u"] in {
     foreach instr = ["max", "min"] in {
-      def int_aarch64_sve_ # ty # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-      def int_aarch64_sve_ # ty # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-      def int_aarch64_sve_ # ty # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
-      def int_aarch64_sve_ # ty # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+      def int_aarch64_sve_ # ty # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
     }
   }
 
@@ -3642,11 +3640,11 @@ let TargetPrefix = "aarch64" in {
   //
 
   foreach instr = ["fmaxnm", "fminnm"] in {
-    def int_aarch64_sve_ # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-    def int_aarch64_sve_ # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+    def int_aarch64_sve_ # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+    def int_aarch64_sve_ # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
-    def int_aarch64_sve_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
-    def int_aarch64_sve_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+    def int_aarch64_sve_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+    def int_aarch64_sve_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
   }
 
   //
@@ -3654,8 +3652,8 @@ let TargetPrefix = "aarch64" in {
   //
 
   foreach instr = ["famax", "famin"] in {
-    def int_aarch64_sme_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
-    def int_aarch64_sme_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+    def int_aarch64_sme_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+    def int_aarch64_sme_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
   }
 
   //
@@ -3677,48 +3675,47 @@ let TargetPrefix = "aarch64" in {
   //
   //Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
   //
-  
-  def int_aarch64_sve_fcvtl_widen_x2  : SME2_CVT_WIDENING_VG2_Intrinsic;
+  def int_aarch64_sve_fcvtl_widen_x2  : SVE2_CVT_WIDENING_VG2_Intrinsic;
 
   //
   // Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
   //
-  def int_aarch64_sve_fcvtn_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_bfcvtn_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
+  def int_aarch64_sve_fcvtn_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_bfcvtn_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
 
   //
   // Multi-vector convert to/from floating-point.
   //
-  def int_aarch64_sve_fcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
-  def int_aarch64_sve_fcvtzs_x2 : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_fcvtzu_x2 : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_scvtf_x2  : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_ucvtf_x2  : SME2_CVT_X2_Intrinsic;
-  def int_aarch64_sve_fcvtzs_x4 : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_scvtf_x4  : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_ucvtf_x4  : SME2_CVT_X4_Intrinsic;
-  def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
+  def int_aarch64_sve_fcvt_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_bfcvt_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
+  def int_aarch64_sve_fcvtzs_x2 : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_fcvtzu_x2 : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_scvtf_x2  : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_ucvtf_x2  : SVE2_CVT_X2_Intrinsic;
+  def int_aarch64_sve_fcvtzs_x4 : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_fcvtzu_x4 : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_scvtf_x4  : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_ucvtf_x4  : SVE2_CVT_X4_Intrinsic;
+  def int_aarch64_sve_fcvt_widen_x2 : SVE2_CVT_WIDENING_VG2_Intrinsic;
   //
   // Multi-vector saturating extract narrow
   //
-  def int_aarch64_sve_sqcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtu_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvt_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvt_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtu_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvt_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvt_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtu_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvt_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvt_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtu_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
 
   //
   // Multi-vector saturating extract narrow and interleave
   //
-  def int_aarch64_sve_sqcvtn_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvtn_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtun_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtn_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_uqcvtn_x4  : SME2_CVT_VG4_SINGLE_Intrinsic;
-  def int_aarch64_sve_sqcvtun_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtn_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvtn_x2  : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtun_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtn_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_uqcvtn_x4  : SVE2_CVT_VG4_SINGLE_Intrinsic;
+  def int_aarch64_sve_sqcvtun_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
 
   //
   // Multi-Single add/sub
@@ -3737,15 +3734,15 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector clamps
-  def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_sclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_uclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_fclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
 
-  def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
-  def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_sclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_uclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_fclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
 
   //
   // Multi-vector add/sub and accumulate into ZA
@@ -3782,8 +3779,8 @@ let TargetPrefix = "aarch64" in {
   //
   // Multi-Single Vector add
   //
-  def int_aarch64_sve_add_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sve_add_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sve_add_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sve_add_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
 
   // 2-way and 4-way multi-vector signed/unsigned integer dot-product
   foreach ty = ["s", "u"] in {
@@ -3841,10 +3838,10 @@ let TargetPrefix = "aarch64" in {
   //
   // Signed/unsigned multi-vector unpacks
   //
-  def int_aarch64_sve_sunpk_x2 : SME2_VG2_Unpk_Intrinsic;
-  def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic;
-  def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic;
-  def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic;
+  def int_aarch64_sve_sunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+  def int_aarch64_sve_uunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+  def int_aarch64_sve_sunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
+  def int_aarch64_sve_uunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
 
   // 2-way and 4-way vector selects
   def int_aarch64_sve_sel_x2  : SVE2_VG2_Sel_Intrinsic;
@@ -4176,4 +4173,4 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2  : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
-}
+}
\ No newline at end of file

>From ca0a3e1b1e70ff841c90c96e93997b38540fdd1b Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 11 Apr 2025 15:20:00 +0000
Subject: [PATCH 4/5] Fix incorrect properties for some ZA intrinsics

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 66 ++++++++++++++++-------
 1 file changed, 48 insertions(+), 18 deletions(-)
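Not part of the patch: a rough sketch, for reviewers, of how the two property
lists used below are expected to surface as IR memory effects. The function
names are made up for illustration and the memory() spellings assume the usual
Intrinsics.td-to-attribute lowering:

  ; Hypothetical declarations, not real intrinsics.
  ; [IntrInaccessibleMemOnly] alone: the ZA-updating forms are modelled as
  ; both reading and writing inaccessible memory, so earlier ZA contents
  ; stay live across a partial update.
  declare void @za_update_example(i32, <vscale x 4 x i32>)
      memory(inaccessiblemem: readwrite)

  ; [IntrInaccessibleMemOnly, IntrWriteMem]: the new add/sub "write" classes
  ; only write ZA, so they remain write-only on inaccessible memory.
  declare void @za_write_only_example(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
      memory(inaccessiblemem: write)

In short, the plain ZA-write classes below drop IntrWriteMem and become
read+write of ZA, while the dedicated add/sub write classes keep the
write-only modelling.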

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 28fa64a7cda75..2c6129cedebbf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3300,14 +3300,14 @@ let TargetPrefix = "aarch64" in {
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>],
-               [IntrWriteMem, IntrInaccessibleMemOnly]>;
+               [IntrInaccessibleMemOnly]>;
 
   class SME2_ZA_Write_VG4_Intrinsic
    : DefaultAttrsIntrinsic<[],
                [llvm_i32_ty,
                 llvm_anyvector_ty, LLVMMatchType<0>,
                 LLVMMatchType<0>,  LLVMMatchType<0>],
-               [IntrWriteMem, IntrInaccessibleMemOnly]>;
+               [IntrInaccessibleMemOnly]>;
 
   class SVE2_VG2_Multi_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3720,18 +3720,48 @@ let TargetPrefix = "aarch64" in {
   //
   // Multi-Single add/sub
   //
-  def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
-  def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
-  def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+  class SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>,
+          LLVMMatchType<0>],
+          [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+  class SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+          LLVMMatchType<0>],
+        [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+  def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+  def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
+  def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
 
   //
   // Multi-Multi add/sub
   //
-  def int_aarch64_sme_add_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
-  def int_aarch64_sme_add_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
-  def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+  class SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>],
+          [IntrInaccessibleMemOnly, IntrWriteMem]>;
+          
+  class SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i32_ty,
+          llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>],
+          [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+  def int_aarch64_sme_add_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_add_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
 
   // Multi-vector clamps
   def int_aarch64_sve_sclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
@@ -4027,12 +4057,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_fp8_fmlalltt      : SVE2_FP8_FMLA_FDOT;
   def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane;
 
-  class SME2_FP8_CVT_X2_Single_Intrinsic
+  class SVE2_FP8_CVT_X2_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [llvm_nxv16i8_ty],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
 
-  class SME2_FP8_CVT_Single_X4_Intrinsic
+  class SVE2_FP8_CVT_Single_X4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
@@ -4096,14 +4126,14 @@ let TargetPrefix = "aarch64" in {
   //
   // CVT from FP8 to half-precision/BFloat16 multi-vector
   //
-  def int_aarch64_sve_fp8_cvt1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
-  def int_aarch64_sve_fp8_cvt2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvt1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvt2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
 
   //
   // CVT from FP8 to deinterleaved half-precision/BFloat16 multi-vector
   //
-  def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
-  def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvtl1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+  def int_aarch64_sve_fp8_cvtl2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
 
   //
   // CVT to FP8 from half-precision/BFloat16/single-precision multi-vector
@@ -4113,8 +4143,8 @@ let TargetPrefix = "aarch64" in {
                             [llvm_anyvector_ty, LLVMMatchType<0>],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
 
-  def int_aarch64_sve_fp8_cvt_x4  : SME2_FP8_CVT_Single_X4_Intrinsic;
-  def int_aarch64_sve_fp8_cvtn_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
+  def int_aarch64_sve_fp8_cvt_x4  : SVE2_FP8_CVT_Single_X4_Intrinsic;
+  def int_aarch64_sve_fp8_cvtn_x4 : SVE2_FP8_CVT_Single_X4_Intrinsic;
 
   // FP8 outer product
   def int_aarch64_sme_fp8_fmopa_za16 : SME_FP8_OuterProduct_Intrinsic;

>From cbc738cfef98e2640eb223af2776b9f11c010d46 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Mon, 14 Apr 2025 12:59:02 +0000
Subject: [PATCH 5/5] Remove dummy use from one of the tests

---
 llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 0fd7fea93c30e..12d945f575f68 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -485,7 +485,6 @@ for.body:
 exit:
   %tmp1 = add <vscale x 4 x i32> %z0, %z1
   %res = add <vscale x 4 x i32> %tmp1, %z2
-  tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
   ret <vscale x 4 x i32> %res
 }
 


