[llvm] 028c722 - [SME2][AArch64] Add multi-multi multiply-add long long intrinsics

Kerry McLaughlin via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 20 06:02:14 PST 2023


Author: Kerry McLaughlin
Date: 2023-02-20T14:01:56Z
New Revision: 028c722ac8ea65db0cf24e77629644a5c6f322c5

URL: https://github.com/llvm/llvm-project/commit/028c722ac8ea65db0cf24e77629644a5c6f322c5
DIFF: https://github.com/llvm/llvm-project/commit/028c722ac8ea65db0cf24e77629644a5c6f322c5.diff

LOG: [SME2][AArch64] Add multi-multi multiply-add long long intrinsics

Adds multi-multi intrinsics (2- and 4-vector forms) for the following SME2 instructions:
 - smlall
 - smlsll
 - umlall
 - umlsll
 - usmlall

NOTE: These intrinsics are still in development and are subject to change.
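
For reference, each of the new multi-multi intrinsics takes a ZA slice index
followed by the two groups of data vectors. A minimal IR sketch of the
2-vector, 8-bit smlall form (operand names are illustrative and match the
tests added below):

  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice,
      <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1,
      <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)

This is selected to an "smlall za.s[w8, 0:3, vgx2], { ... }, { ... }"
instruction, updating the group of four ZA.S slices addressed by the slice
index; the exact register allocation is shown in sme2-intrinsics-mlall.ll.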

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D143277

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
    llvm/lib/Target/AArch64/SMEInstrFormats.td
    llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b2ae369b47e5..68404d4a66e7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3064,6 +3064,9 @@ let TargetPrefix = "aarch64" in {
         def int_aarch64_sme_ # ty # instr # _ # za # _single_vg4x1 : SME2_Matrix_ArrayVector_Single_Single_Intrinsic;
         def int_aarch64_sme_ # ty # instr # _ # za # _single_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
         def int_aarch64_sme_ # ty # instr # _ # za # _single_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+        def int_aarch64_sme_ # ty # instr # _ # za # _vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+        def int_aarch64_sme_ # ty # instr # _ # za # _vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
       }
     }
   }
@@ -3075,6 +3078,9 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_usmla_za32_single_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
   def int_aarch64_sme_usmla_za32_single_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
 
+  def int_aarch64_sme_usmla_za32_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+  def int_aarch64_sme_usmla_za32_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+
   // Multi-vector signed saturating doubling multiply high
 
   def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;

diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 5f4a565b7971..f2d8163258bc 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -518,8 +518,8 @@ defm SMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlall", 0b000>;
 defm SMLALL_MZZ_BtoS       : sme2_mla_ll_array_single<"smlall", 0b0000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x1>;
 defm SMLALL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg2_single<"smlall", 0b00000, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x2>;
 defm SMLALL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg4_single<"smlall", 0b01000, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x4>;
-defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b0000, MatrixOp32, ZZ_b_mul_r>;
-defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b0000, MatrixOp32, ZZZZ_b_mul_r>;
+defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b0000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x2>;
+defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b0000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x4>;
 
 def  USMLALL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"usmlall", 0b001>;
 defm USMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"usmlall", 0b100>;
@@ -527,8 +527,8 @@ defm USMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"usmlall", 0b100>;
 defm USMLALL_MZZ_BtoS       : sme2_mla_ll_array_single<"usmlall", 0b0001, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x1>;
 defm USMLALL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg2_single<"usmlall", 0b00001, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x2>;
 defm USMLALL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg4_single<"usmlall", 0b01001, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x4>;
-defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b0001, MatrixOp32, ZZ_b_mul_r>;
-defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b0001, MatrixOp32, ZZZZ_b_mul_r>;
+defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b0001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x2>;
+defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b0001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x4>;
 
 def  SMLSLL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"smlsll", 0b010>;
 defm SMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlsll", 0b001>;
@@ -536,8 +536,8 @@ defm SMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlsll", 0b001>;
 defm SMLSLL_MZZ_BtoS       : sme2_mla_ll_array_single<"smlsll", 0b0010, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x1>;
 defm SMLSLL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg2_single<"smlsll", 0b00010, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x2>;
 defm SMLSLL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg4_single<"smlsll", 0b01010, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x4>;
-defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b0010, MatrixOp32, ZZ_b_mul_r>;
-defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b0010, MatrixOp32, ZZZZ_b_mul_r>;
+defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b0010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x2>;
+defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b0010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x4>;
 
 def  UMLALL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"umlall", 0b100>;
 defm UMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlall", 0b010>;
@@ -545,8 +545,8 @@ defm UMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlall", 0b010>;
 defm UMLALL_MZZ_BtoS       : sme2_mla_ll_array_single<"umlall", 0b0100, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x1>;
 defm UMLALL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg2_single<"umlall", 0b00100, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x2>;
 defm UMLALL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg4_single<"umlall", 0b01100, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x4>;
-defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b0100, MatrixOp32, ZZ_b_mul_r>;
-defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b0100, MatrixOp32, ZZZZ_b_mul_r>;
+defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b0100, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x2>;
+defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b0100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x4>;
 
 def  SUMLALL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"sumlall", 0b101>;
 defm SUMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"sumlall", 0b110>;
@@ -560,8 +560,8 @@ defm UMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlsll", 0b011>;
 defm UMLSLL_MZZ_BtoS       : sme2_mla_ll_array_single<"umlsll", 0b0110, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x1>;
 defm UMLSLL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg2_single<"umlsll", 0b00110, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x2>;
 defm UMLSLL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg4_single<"umlsll", 0b01110, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x4>;
-defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r>;
-defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r>;
+defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x2>;
+defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x4>;
 
 defm BMOPA_MPPZZ_S : sme2_int_bmopx_tile<"bmopa", 0b100, int_aarch64_sme_bmopa_za32>;
 defm BMOPS_MPPZZ_S : sme2_int_bmopx_tile<"bmops", 0b101, int_aarch64_sme_bmops_za32>;
@@ -745,8 +745,8 @@ defm SMLALL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"smlall", 0b00>;
 defm SMLALL_MZZ_HtoD       : sme2_mla_ll_array_single<"smlall", 0b1000, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x1>;
 defm SMLALL_VG2_M2ZZ_HtoD  : sme2_mla_ll_array_vg2_single<"smlall", 0b10000, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x2>;
 defm SMLALL_VG4_M4ZZ_HtoD  : sme2_mla_ll_array_vg4_single<"smlall", 0b11000, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x4>;
-defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall",  0b1000, MatrixOp64, ZZ_h_mul_r>;
-defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall",  0b1000, MatrixOp64, ZZZZ_h_mul_r>;
+defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall",  0b1000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x2>;
+defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall",  0b1000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x4>;
 
 def  SMLSLL_MZZI_HtoD      : sme2_mla_ll_array_index_64b<"smlsll", 0b01>;
 defm SMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlsll", 0b01>;
@@ -754,8 +754,8 @@ defm SMLSLL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"smlsll", 0b01>;
 defm SMLSLL_MZZ_HtoD       : sme2_mla_ll_array_single<"smlsll", 0b1010, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x1>;
 defm SMLSLL_VG2_M2ZZ_HtoD  : sme2_mla_ll_array_vg2_single<"smlsll", 0b10010, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x2>;
 defm SMLSLL_VG4_M4ZZ_HtoD  : sme2_mla_ll_array_vg4_single<"smlsll", 0b11010, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x4>;
-defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll",  0b1010, MatrixOp64, ZZ_h_mul_r>;
-defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll",  0b1010, MatrixOp64, ZZZZ_h_mul_r>;
+defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll",  0b1010, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x2>;
+defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll",  0b1010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x4>;
 
 def  UMLALL_MZZI_HtoD      : sme2_mla_ll_array_index_64b<"umlall", 0b10>;
 defm UMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlall", 0b10>;
@@ -763,8 +763,8 @@ defm UMLALL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"umlall", 0b10>;
 defm UMLALL_MZZ_HtoD       : sme2_mla_ll_array_single<"umlall", 0b1100, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x1>;
 defm UMLALL_VG2_M2ZZ_HtoD  : sme2_mla_ll_array_vg2_single<"umlall", 0b10100, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x2>;
 defm UMLALL_VG4_M4ZZ_HtoD  : sme2_mla_ll_array_vg4_single<"umlall", 0b11100, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x4>;
-defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall",  0b1100, MatrixOp64, ZZ_h_mul_r>;
-defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall",  0b1100, MatrixOp64, ZZZZ_h_mul_r>;
+defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall",  0b1100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x2>;
+defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall",  0b1100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x4>;
 
 def  UMLSLL_MZZI_HtoD      : sme2_mla_ll_array_index_64b<"umlsll", 0b11>;
 defm UMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlsll", 0b11>;
@@ -772,8 +772,8 @@ defm UMLSLL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"umlsll", 0b11>;
 defm UMLSLL_MZZ_HtoD       : sme2_mla_ll_array_single<"umlsll", 0b1110, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x1>;
 defm UMLSLL_VG2_M2ZZ_HtoD  : sme2_mla_ll_array_vg2_single<"umlsll", 0b10110, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x2>;
 defm UMLSLL_VG4_M4ZZ_HtoD  : sme2_mla_ll_array_vg4_single<"umlsll", 0b11110, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x4>;
-defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll",  0b1110, MatrixOp64, ZZ_h_mul_r>;
-defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll",  0b1110, MatrixOp64, ZZZZ_h_mul_r>;
+defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll",  0b1110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x2>;
+defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll",  0b1110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x4>;
 }
 
 let Predicates = [HasSME2, HasSMEF64F64] in {

diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 2f80351b85da..a3f6f6c433ce 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -2817,8 +2817,13 @@ class sme2_mla_ll_array_vg2_multi<bits<4> op, MatrixOperand matrix_ty,
 
 multiclass sme2_mla_ll_array_vg2_multi<string mnemonic, bits<4> op,
                                        MatrixOperand matrix_ty,
-                                       RegisterOperand vector_ty> {
-  def NAME : sme2_mla_ll_array_vg2_multi<op, matrix_ty, vector_ty, mnemonic>;
+                                       RegisterOperand vector_ty,
+                                       ValueType vt, SDPatternOperator intrinsic> {
+  def NAME : sme2_mla_ll_array_vg2_multi<op, matrix_ty, vector_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, uimm1s4range, vector_ty, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME, intrinsic, uimm1s4range, vt, tileslicerange1s4>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                  (!cast<Instruction>(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, vector_ty:$Zn, vector_ty:$Zm), 0>;
@@ -2856,8 +2861,13 @@ class sme2_mla_ll_array_vg4_multi<bits<4> op,MatrixOperand matrix_ty,
 
 multiclass sme2_mla_ll_array_vg4_multi<string mnemonic, bits<4> op,
                                        MatrixOperand matrix_ty,
-                                       RegisterOperand vector_ty> {
-  def NAME : sme2_mla_ll_array_vg4_multi<op, matrix_ty, vector_ty, mnemonic>;
+                                       RegisterOperand vector_ty,
+                                       ValueType vt, SDPatternOperator intrinsic> {
+  def NAME : sme2_mla_ll_array_vg4_multi<op, matrix_ty, vector_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+  def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, uimm1s4range, vector_ty, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME, intrinsic, uimm1s4range, vt, tileslicerange1s4>;
 
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
                  (!cast<Instruction>(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, vector_ty:$Zn, vector_ty:$Zm), 0>;

diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll
index e844206cb588..737cea559355 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll
@@ -101,6 +101,88 @@ define void @multi_vector_mul_add_single_long_vg4x4_s16(i32 %slice, <vscale x 8
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_add_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_add_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ; UMLALL
 
 ; Single x1
@@ -199,6 +281,88 @@ define void @multi_vector_mul_add_single_long_vg4x4_u16(i32 %slice, <vscale x 8
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_add_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_add_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ; SMLSLL
 
 ; Single x1
@@ -297,6 +461,88 @@ define void @multi_vector_mul_sub_single_long_vg4x4_s16(i32 %slice, <vscale x 8
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ; UMLSLL
 
 ; Single x1
@@ -395,6 +641,88 @@ define void @multi_vector_mul_sub_single_long_vg4x4_u16(i32 %slice, <vscale x 8
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
+define void @multi_vector_mul_sub_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+  ret void
+}
+
 ;
 ; SUMLALL
 ;
@@ -488,6 +816,49 @@ define void @multi_vector_mul_add_single_unsigned_long_vg4x4_s8(i32 %slice, <vsc
   ret void
 }
 
+; Multi x2
+
+define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
+; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x2_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
+  ret void
+}
+
+; Multi x4
+
+define void @multi_vector_mul_add_multi_unsigned_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
+; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z26.d, z7.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z31.d, z4.d
+; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z29.d, z2.d
+; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  %slice.4 = add i32 %slice, 4
+  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
+  ret void
+}
+
 declare void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -496,6 +867,12 @@ declare void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x
 declare void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -504,6 +881,12 @@ declare void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x
 declare void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -512,6 +895,12 @@ declare void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x
 declare void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -520,9 +909,18 @@ declare void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x
 declare void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 
+declare void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
 declare void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 
 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+declare void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)


        

