[llvm] 2e35d68 - [AArch64][SME2] Add multi-vector multiply-add long intrinsics.
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 20 03:33:53 PST 2023
Author: Kerry McLaughlin
Date: 2023-01-20T11:33:29Z
New Revision: 2e35d684d798f8e330e5a18b524cef32ede9be89
URL: https://github.com/llvm/llvm-project/commit/2e35d684d798f8e330e5a18b524cef32ede9be89
DIFF: https://github.com/llvm/llvm-project/commit/2e35d684d798f8e330e5a18b524cef32ede9be89.diff
LOG: [AArch64][SME2] Add multi-vector multiply-add long intrinsics.
Adds (single, multi & indexed) intrinsics for the following:
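- bfmlal/bfmlsl (selected through the overloaded fmlal/fmlsl intrinsics with bf16 vector operands; no separate bf-prefixed intrinsics are defined)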
- fmlal/fmlsl
- smlal/smlsl
- umlal/umlsl
This patch also extends SelectSMETileSlice to handle scaled vector select offsets.
NOTE: These intrinsics are still in development and are subject to future changes.
Reviewed By: CarolineConcatto
Differential Revision: https://reviews.llvm.org/D142004
Added:
llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
llvm/lib/Target/AArch64/SMEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 5a28a6eb0370..8bd1f8071029 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2697,6 +2697,12 @@ let TargetPrefix = "aarch64" in {
// SME2 Intrinsics
//
+ class SME2_Matrix_ArrayVector_Single_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>],
+ []>;
+
class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
@@ -2725,6 +2731,13 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[]>;
+ class SME2_Matrix_ArrayVector_Single_Index_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty,
+ LLVMMatchType<0>, llvm_i32_ty],
+ [ImmArg<ArgIndex<3>>]>;
+
class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
@@ -2757,4 +2770,23 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_fmls_lane_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
def int_aarch64_sme_fmla_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
def int_aarch64_sme_fmls_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+
+ //
+ // Multi-vector multiply-add/subtract long
+ //
+
+ foreach ty = ["f", "s", "u"] in {
+ foreach instr = ["mlal", "mlsl"] in {
+ def int_aarch64_sme_ # ty # instr # _single_vg2x1 : SME2_Matrix_ArrayVector_Single_Single_Intrinsic;
+ def int_aarch64_sme_ # ty # instr # _single_vg2x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sme_ # ty # instr # _single_vg2x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+ def int_aarch64_sme_ # ty # instr # _vg2x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_ # ty # instr # _vg2x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+
+ def int_aarch64_sme_ # ty # instr # _lane_vg2x1 : SME2_Matrix_ArrayVector_Single_Index_Intrinsic;
+ def int_aarch64_sme_ # ty # instr # _lane_vg2x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
+ def int_aarch64_sme_ # ty # instr # _lane_vg2x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+ }
+ }
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 666b6292e28f..018d4f3201c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -369,9 +369,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
}
- template <unsigned Scale>
+ template <unsigned MaxIdx, unsigned Scale>
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
- return SelectSMETileSlice(N, Scale, Vector, Offset);
+ return SelectSMETileSlice(N, MaxIdx, Vector, Offset, Scale);
}
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -443,8 +443,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
- bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector,
- SDValue &Offset);
+ bool SelectSMETileSlice(SDValue N, unsigned MaxSize, SDValue &Vector,
+ SDValue &Offset, unsigned Scale = 1);
bool SelectAllActivePredicate(SDValue N);
};
@@ -5895,8 +5895,9 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
return TLI->isAllActivePredicate(*CurDAG, N);
}
-bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale,
- SDValue &Base, SDValue &Offset) {
+bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize,
+ SDValue &Base, SDValue &Offset,
+ unsigned Scale) {
if (N.getOpcode() != ISD::ADD) {
Base = N;
Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
@@ -5909,13 +5910,12 @@ bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale,
if (auto C = dyn_cast<ConstantSDNode>(RHS)) {
int64_t ImmOff = C->getSExtValue();
- unsigned MaxSize = (1 << Scale) - 1;
- if (ImmOff < 0 || ImmOff > MaxSize)
+ if ((ImmOff < 0 || ImmOff > MaxSize) || (ImmOff % Scale != 0))
return false;
Base = LHS;
- Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64);
+ Offset = CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index e8a186057ff1..92e197da0669 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -291,77 +291,77 @@ defm SQDMULH_VG4_4ZZ : sme2_int_sve_destructive_vector_vg4_single<"sqdmulh", 0b1
defm SQDMULH_VG2_2Z2Z : sme2_int_sve_destructive_vector_vg2_multi<"sqdmulh", 0b1000000>;
defm SQDMULH_VG4_4Z4Z : sme2_int_sve_destructive_vector_vg4_multi<"sqdmulh", 0b1000000>;
-defm FMLAL_MZZI : sme2_mla_long_array_index<"fmlal", 0b10, 0b00>;
-defm FMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlal", 0b00>;
-defm FMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlal", 0b00>;
-defm FMLAL_MZZ : sme2_mla_long_array_single<"fmlal", 0b00, 0b00>;
-defm FMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b00>;
-defm FMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b00>;
-defm FMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b00>;
-defm FMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b00>;
-
-defm FMLSL_MZZI : sme2_mla_long_array_index<"fmlsl", 0b10, 0b01>;
-defm FMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlsl", 0b01>;
-defm FMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlsl", 0b01>;
-defm FMLSL_MZZ : sme2_mla_long_array_single<"fmlsl", 0b00, 0b01>;
-defm FMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b01>;
-defm FMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b01>;
-defm FMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b01>;
-defm FMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b01>;
-
-defm BFMLAL_MZZI : sme2_mla_long_array_index<"bfmlal", 0b10, 0b10>;
-defm BFMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlal", 0b10>;
-defm BFMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlal", 0b10>;
-defm BFMLAL_MZZ : sme2_mla_long_array_single<"bfmlal", 0b00, 0b10>;
-defm BFMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b10>;
-defm BFMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b10>;
-defm BFMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b10>;
-defm BFMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b10>;
-
-defm BFMLSL_MZZI : sme2_mla_long_array_index<"bfmlsl", 0b10, 0b11>;
-defm BFMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlsl", 0b11>;
-defm BFMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlsl", 0b11>;
-defm BFMLSL_MZZ : sme2_mla_long_array_single<"bfmlsl", 0b00, 0b11>;
-defm BFMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b11>;
-defm BFMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b11>;
-defm BFMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b11>;
-defm BFMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b11>;
-
-defm SMLAL_MZZI : sme2_mla_long_array_index<"smlal", 0b11, 0b00>;
-defm SMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlal", 0b00>;
-defm SMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlal", 0b00>;
-defm SMLAL_MZZ : sme2_mla_long_array_single<"smlal",0b01, 0b00>;
-defm SMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlal", 0b00>;
-defm SMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlal", 0b00>;
-defm SMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlal", 0b00>;
-defm SMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlal", 0b00>;
-
-defm SMLSL_MZZI : sme2_mla_long_array_index<"smlsl", 0b11, 0b01>;
-defm SMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlsl", 0b01>;
-defm SMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlsl", 0b01>;
-defm SMLSL_MZZ : sme2_mla_long_array_single<"smlsl",0b01, 0b01>;
-defm SMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlsl", 0b01>;
-defm SMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlsl", 0b01>;
-defm SMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlsl", 0b01>;
-defm SMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlsl", 0b01>;
-
-defm UMLAL_MZZI : sme2_mla_long_array_index<"umlal", 0b11, 0b10>;
-defm UMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlal", 0b10>;
-defm UMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlal", 0b10>;
-defm UMLAL_MZZ : sme2_mla_long_array_single<"umlal",0b01, 0b10>;
-defm UMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlal", 0b10>;
-defm UMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlal", 0b10>;
-defm UMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlal", 0b10>;
-defm UMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlal", 0b10>;
-
-defm UMLSL_MZZI : sme2_mla_long_array_index<"umlsl", 0b11, 0b11>;
-defm UMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlsl", 0b11>;
-defm UMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlsl", 0b11>;
-defm UMLSL_MZZ : sme2_mla_long_array_single<"umlsl",0b01, 0b11>;
-defm UMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlsl", 0b11>;
-defm UMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlsl", 0b11>;
-defm UMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlsl", 0b11>;
-defm UMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlsl", 0b11>;
+defm FMLAL_MZZI : sme2_mla_long_array_index<"fmlal", 0b10, 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x1>;
+defm FMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x2>;
+defm FMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x4>;
+defm FMLAL_MZZ : sme2_mla_long_array_single<"fmlal", 0b00, 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x1>;
+defm FMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x2>;
+defm FMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x4>;
+defm FMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x2>;
+defm FMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x4>;
+
+defm FMLSL_MZZI : sme2_mla_long_array_index<"fmlsl", 0b10, 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x1>;
+defm FMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x2>;
+defm FMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x4>;
+defm FMLSL_MZZ : sme2_mla_long_array_single<"fmlsl", 0b00, 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x1>;
+defm FMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x2>;
+defm FMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x4>;
+defm FMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x2>;
+defm FMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x4>;
+
+defm BFMLAL_MZZI : sme2_mla_long_array_index<"bfmlal", 0b10, 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x1>;
+defm BFMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x2>;
+defm BFMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x4>;
+defm BFMLAL_MZZ : sme2_mla_long_array_single<"bfmlal", 0b00, 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x1>;
+defm BFMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x2>;
+defm BFMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x4>;
+defm BFMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x2>;
+defm BFMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x4>;
+
+defm BFMLSL_MZZI : sme2_mla_long_array_index<"bfmlsl", 0b10, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x1>;
+defm BFMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x2>;
+defm BFMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x4>;
+defm BFMLSL_MZZ : sme2_mla_long_array_single<"bfmlsl", 0b00, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x1>;
+defm BFMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x2>;
+defm BFMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x4>;
+defm BFMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x2>;
+defm BFMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x4>;
+
+defm SMLAL_MZZI : sme2_mla_long_array_index<"smlal", 0b11, 0b00, nxv8i16, int_aarch64_sme_smlal_lane_vg2x1>;
+defm SMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlal", 0b00, int_aarch64_sme_smlal_lane_vg2x2>;
+defm SMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlal", 0b00, int_aarch64_sme_smlal_lane_vg2x4>;
+defm SMLAL_MZZ : sme2_mla_long_array_single<"smlal",0b01, 0b00, nxv8i16, int_aarch64_sme_smlal_single_vg2x1>;
+defm SMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlal", 0b00, int_aarch64_sme_smlal_single_vg2x2>;
+defm SMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlal", 0b00, int_aarch64_sme_smlal_single_vg2x4>;
+defm SMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlal", 0b00, int_aarch64_sme_smlal_vg2x2>;
+defm SMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlal", 0b00, int_aarch64_sme_smlal_vg2x4>;
+
+defm SMLSL_MZZI : sme2_mla_long_array_index<"smlsl", 0b11, 0b01, nxv8i16, int_aarch64_sme_smlsl_lane_vg2x1>;
+defm SMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlsl", 0b01, int_aarch64_sme_smlsl_lane_vg2x2>;
+defm SMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"smlsl", 0b01, int_aarch64_sme_smlsl_lane_vg2x4>;
+defm SMLSL_MZZ : sme2_mla_long_array_single<"smlsl",0b01, 0b01, nxv8i16, int_aarch64_sme_smlsl_single_vg2x1>;
+defm SMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"smlsl", 0b01, int_aarch64_sme_smlsl_single_vg2x2>;
+defm SMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"smlsl", 0b01, int_aarch64_sme_smlsl_single_vg2x4>;
+defm SMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"smlsl", 0b01, int_aarch64_sme_smlsl_vg2x2>;
+defm SMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"smlsl", 0b01, int_aarch64_sme_smlsl_vg2x4>;
+
+defm UMLAL_MZZI : sme2_mla_long_array_index<"umlal", 0b11, 0b10, nxv8i16, int_aarch64_sme_umlal_lane_vg2x1>;
+defm UMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlal", 0b10, int_aarch64_sme_umlal_lane_vg2x2>;
+defm UMLAL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlal", 0b10, int_aarch64_sme_umlal_lane_vg2x4>;
+defm UMLAL_MZZ : sme2_mla_long_array_single<"umlal",0b01, 0b10, nxv8i16, int_aarch64_sme_umlal_single_vg2x1>;
+defm UMLAL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlal", 0b10, int_aarch64_sme_umlal_single_vg2x2>;
+defm UMLAL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlal", 0b10, int_aarch64_sme_umlal_single_vg2x4>;
+defm UMLAL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlal", 0b10, int_aarch64_sme_umlal_vg2x2>;
+defm UMLAL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlal", 0b10, int_aarch64_sme_umlal_vg2x4>;
+
+defm UMLSL_MZZI : sme2_mla_long_array_index<"umlsl", 0b11, 0b11, nxv8i16, int_aarch64_sme_umlsl_lane_vg2x1>;
+defm UMLSL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"umlsl", 0b11, int_aarch64_sme_umlsl_lane_vg2x2>;
+defm UMLSL_VG4_M4ZZI : sme2_int_mla_long_array_vg4_index<"umlsl", 0b11, int_aarch64_sme_umlsl_lane_vg2x4>;
+defm UMLSL_MZZ : sme2_mla_long_array_single<"umlsl",0b01, 0b11, nxv8i16, int_aarch64_sme_umlsl_single_vg2x1>;
+defm UMLSL_VG2_M2ZZ : sme2_int_mla_long_array_vg2_single<"umlsl", 0b11, int_aarch64_sme_umlsl_single_vg2x2>;
+defm UMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlsl", 0b11, int_aarch64_sme_umlsl_single_vg2x4>;
+defm UMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x2>;
+defm UMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x4>;
defm FCVT_Z2Z_StoH : sme2_cvt_vg2_single<"fcvt", 0b0000>;
defm FCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"fcvtn", 0b0001>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 27f7c5003259..54abd7fa990d 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -16,11 +16,14 @@ def imm_to_tile32 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAS0>", []>;
def imm_to_tile64 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAD0>", []>;
def imm_to_tile128 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAQ0>", []>;
-def tileslice8 : ComplexPattern<i32 , 2, "SelectSMETileSlice<4>", []>;
-def tileslice16 : ComplexPattern<i32 , 2, "SelectSMETileSlice<3>", []>;
-def tileslice32 : ComplexPattern<i32 , 2, "SelectSMETileSlice<2>", []>;
-def tileslice64 : ComplexPattern<i32 , 2, "SelectSMETileSlice<1>", []>;
-def tileslice128 : ComplexPattern<i32 , 2, "SelectSMETileSlice<0>", []>; // nop
+def tileslice8 : ComplexPattern<i32 , 2, "SelectSMETileSlice<15, 1>", []>;
+def tileslice16 : ComplexPattern<i32 , 2, "SelectSMETileSlice<7, 1>", []>;
+def tileslice32 : ComplexPattern<i32 , 2, "SelectSMETileSlice<3, 1>", []>;
+def tileslice64 : ComplexPattern<i32 , 2, "SelectSMETileSlice<1, 1>", []>;
+def tileslice128 : ComplexPattern<i32 , 2, "SelectSMETileSlice<0, 1>", []>; // nop
+
+def tileslicerange3s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<14, 2>", []>;
+def tileslicerange2s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<6, 2>", []>;
def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>", [], [SDNPWantRoot]>;
@@ -78,6 +81,12 @@ class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, Regist
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
+class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
+ ValueType vt, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn, vt:$Zm),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset, vt:$Zn, zpr_ty:$Zm)>;
+
+
class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
ValueType vt, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
@@ -104,6 +113,12 @@ class SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<string name, SDPatternOperator intrinsic
(REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
(REG_SEQUENCE ZPR4Mul4, vt:$Zm1, zsub0, vt:$Zm2, zsub1, vt:$Zm3, zsub2, vt:$Zm4, zsub3))>;
+class SME2_ZA_TwoOp_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
+ Operand imm_ty, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn, vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset, vt:$Zn, zpr_ty:$Zm, (i32 imm_ty:$i))>;
+
+
class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
@@ -1674,7 +1689,7 @@ class sme2_mla_long_array_index_base<bits<2> op0, bits<2> op, Operand index_ty,
RegisterOperand multi_vector_ty,
string mnemonic, string vg_acronym="">
: I<(outs MatrixOp32:$ZAda),
- (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm, multi_vector_ty:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3),
+ (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm, multi_vector_ty:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3),
mnemonic, "\t$ZAda[$Rv, $imm" # !if(!eq(vg_acronym, ""), "", ", " # vg_acronym) # "], $Zn, $Zm$i3",
"", []>, Sched<[]> {
bits<4> Zm;
@@ -1691,9 +1706,9 @@ class sme2_mla_long_array_index_base<bits<2> op0, bits<2> op, Operand index_ty,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_mla_long_array_index<string mnemonic, bits<2> op0, bits<2> op> {
+multiclass sme2_mla_long_array_index<string mnemonic, bits<2> op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
def _S : sme2_mla_long_array_index_base<op0, op, uimm3s2range, ZPR16,
- mnemonic> {
+ mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
bits<3> i3;
bits<5> Zn;
bits<3> imm;
@@ -1702,6 +1717,10 @@ multiclass sme2_mla_long_array_index<string mnemonic, bits<2> op0, bits<2> op> {
let Inst{9-5} = Zn;
let Inst{2-0} = imm;
}
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm3s2range, ZPR16, ZPR4b16, VectorIndexH32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_Multi_Index_Pat<NAME # _S, intrinsic, uimm3s2range, ZPR4b16, zpr_ty, VectorIndexH32b_timm, tileslicerange3s2>;
}
class sme2_mla_long_array_vg2_index<string mnemonic, bits<2> op0, bits<2> op>
@@ -1718,18 +1737,26 @@ class sme2_mla_long_array_vg2_index<string mnemonic, bits<2> op0, bits<2> op>
let Inst{1-0} = imm;
}
-multiclass sme2_fp_mla_long_array_vg2_index<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b10, op>;
+multiclass sme2_fp_mla_long_array_vg2_index<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b10, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, VectorIndexH32b_timm, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
- (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+ (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
}
-multiclass sme2_int_mla_long_array_vg2_index<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b11, op>;
+multiclass sme2_int_mla_long_array_vg2_index<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg2_index<mnemonic, 0b11, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, VectorIndexH32b_timm, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
- (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+ (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
}
class sme2_mla_long_array_vg4_index<string mnemonic, bits<2> op0, bits<2> op>
@@ -1746,18 +1773,26 @@ class sme2_mla_long_array_vg4_index<string mnemonic, bits<2> op0, bits<2> op>
let Inst{1-0} = imm;
}
-multiclass sme2_fp_mla_long_array_vg4_index<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b10, op>;
+multiclass sme2_fp_mla_long_array_vg4_index<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b10, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, VectorIndexH32b_timm, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
- (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+ (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
}
-multiclass sme2_int_mla_long_array_vg4_index<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b11, op>;
+multiclass sme2_int_mla_long_array_vg4_index<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg4_index<mnemonic, 0b11, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r, ZPR4b16, VectorIndexH32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, VectorIndexH32b_timm, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm$i3",
- (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i3), 0>;
+ (!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>;
}
class sme2_mla_long_array<bits<2>op0, bits<2> op, Operand index_ty,
@@ -1782,9 +1817,9 @@ class sme2_mla_long_array<bits<2>op0, bits<2> op, Operand index_ty,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_mla_long_array_single<string mnemonic, bits<2> op0, bits<2> op> {
+multiclass sme2_mla_long_array_single<string mnemonic, bits<2> op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
def _S : sme2_mla_long_array<op0, op, uimm3s2range, ZPR16, ZPR4b16,
- mnemonic> {
+ mnemonic> , SMEPseudo2Instr<NAME # _S, 1>{
bits<4> Zm;
bits<5> Zn;
bits<3> imm;
@@ -1793,6 +1828,10 @@ multiclass sme2_mla_long_array_single<string mnemonic, bits<2> op0, bits<2> op>
let Inst{9-5} = Zn;
let Inst{2-0} = imm;
}
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm3s2range, ZPR16, ZPR4b16, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_Multi_Single_Pat<NAME # _S, intrinsic, uimm3s2range, ZPR4b16, zpr_ty, tileslicerange3s2>;
}
class sme2_mla_long_array_vg24_single<bits<2> op0, bit vg4, bits<2> op,
@@ -1810,33 +1849,49 @@ class sme2_mla_long_array_vg24_single<bits<2> op0, bit vg4, bits<2> op,
let Inst{1-0} = imm;
}
-multiclass sme2_fp_mla_long_array_vg2_single<string mnemonic, bits<2> op> {
+multiclass sme2_fp_mla_long_array_vg2_single<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
def _S : sme2_mla_long_array_vg24_single<0b00, 0b0, op, ZZ_h, mnemonic,
- "vgx2">;
+ "vgx2">, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZ_h, ZPR4b16, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>;
}
-multiclass sme2_int_mla_long_array_vg2_single<string mnemonic, bits<2> op> {
+multiclass sme2_int_mla_long_array_vg2_single<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
def _S : sme2_mla_long_array_vg24_single<0b01, 0b0, op, ZZ_h, mnemonic,
- "vgx2">;
+ "vgx2">, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZ_h, ZPR4b16, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>;
}
-multiclass sme2_fp_mla_long_array_vg4_single<string mnemonic, bits<2> op> {
+multiclass sme2_fp_mla_long_array_vg4_single<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
def _S : sme2_mla_long_array_vg24_single<0b00, 0b1, op, ZZZZ_h, mnemonic,
- "vgx4">;
+ "vgx4">, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZZZ_h, ZPR4b16, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, zpr_ty, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>;
}
-multiclass sme2_int_mla_long_array_vg4_single<string mnemonic, bits<2> op> {
+multiclass sme2_int_mla_long_array_vg4_single<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
def _S : sme2_mla_long_array_vg24_single<0b01, 0b1, op, ZZZZ_h, mnemonic,
- "vgx4">;
+ "vgx4">, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME # _S, uimm2s2range, ZZZZ_h, ZPR4b16, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME # _S, intrinsic, uimm2s2range, ZPR4b16, nxv8i16, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>;
@@ -1855,15 +1910,23 @@ class sme2_mla_long_array_vg2_multi<string mnemonic, bits<2> op0, bits<2> op>
let Inst{1-0} = imm;
}
-multiclass sme2_fp_mla_long_array_vg2_multi<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b10, op>;
+multiclass sme2_fp_mla_long_array_vg2_multi<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b10, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, zpr_ty, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>;
}
-multiclass sme2_int_mla_long_array_vg2_multi<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b11, op>;
+multiclass sme2_int_mla_long_array_vg2_multi<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg2_multi<mnemonic, 0b11, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZ_h_mul_r, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, nxv8i16, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>;
@@ -1884,15 +1947,23 @@ class sme2_mla_long_array_vg4_multi<string mnemonic, bits<2> op0, bits<2> op>
let Inst{1-0} = imm;
}
-multiclass sme2_fp_mla_long_array_vg4_multi<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b10, op>;
+multiclass sme2_fp_mla_long_array_vg4_multi<string mnemonic, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b10, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, zpr_ty, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>;
}
-multiclass sme2_int_mla_long_array_vg4_multi<string mnemonic, bits<2> op> {
- def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b11, op>;
+multiclass sme2_int_mla_long_array_vg4_multi<string mnemonic, bits<2> op, SDPatternOperator intrinsic> {
+ def _S : sme2_mla_long_array_vg4_multi<mnemonic, 0b11, op>, SMEPseudo2Instr<NAME # _S, 1>;
+
+ def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME # _S, uimm2s2range, ZZZZ_h_mul_r, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME # _S, intrinsic, uimm2s2range, nxv8i16, tileslicerange2s2>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm",
(!cast<Instruction>(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
new file mode 100644
index 000000000000..95527daa24be
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
@@ -0,0 +1,1322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+bf16 -verify-machineinstrs < %s | FileCheck %s
+
+;
+; BF/F/S/UMLAL x1 (SINGLE)
+;
+
+define void @multi_vector_add_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x1_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: bfmlal za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: bfmlal za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x1_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: fmlal za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: fmlal za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x1_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: smlal za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: smlal za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x1_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: umlal za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: umlal za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x1 (SINGLE)
+;
+
+define void @multi_vector_sub_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x1_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: bfmlsl za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x1_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: fmlsl za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: fmlsl za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x1_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: smlsl za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: smlsl za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x1_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: umlsl za.s[w8, 0:1], z0.h, z1.h
+; CHECK-NEXT: umlsl za.s[w8, 14:15], z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+;
+; BF/F/S/UMLAL x2 (SINGLE)
+;
+
+define void @multi_vector_add_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x2_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x2_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x2 (SINGLE)
+;
+
+define void @multi_vector_sub_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x2_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x2_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+;
+; BF/F/S/UMLAL x4 (SINGLE)
+;
+
+define void @multi_vector_add_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x4_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg2x4_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x4 (SINGLE)
+;
+
+define void @multi_vector_sub_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x4_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg2x4_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm)
+ ret void
+}
+
+;
+; BF/F/S/UMLAL x2 (MULTI)
+;
+
+define void @multi_vector_add_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
+; CHECK-LABEL: multi_vector_add_multi_vg2x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
+ ret void
+}
+
+define void @multi_vector_add_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
+; CHECK-LABEL: multi_vector_add_multi_vg2x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
+ ret void
+}
+
+define void @multi_vector_add_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_add_multi_vg2x2_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ ret void
+}
+
+define void @multi_vector_add_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_add_multi_vg2x2_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x2 (MULTI)
+;
+
+define void @multi_vector_sub_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
+; CHECK-LABEL: multi_vector_sub_multi_vg2x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
+ ret void
+}
+
+define void @multi_vector_sub_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
+; CHECK-LABEL: multi_vector_sub_multi_vg2x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
+ ret void
+}
+
+; SMLSL (multi, vgx2) with i16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7 off one base register.
+define void @multi_vector_sub_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_sub_multi_vg2x2_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ ret void
+}
+
+; UMLSL (multi, vgx2) with i16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7 off one base register.
+define void @multi_vector_sub_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
+; CHECK-LABEL: multi_vector_sub_multi_vg2x2_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
+ ret void
+}
+
+;
+; BF/F/S/UMLAL x4 (MULTI)
+;
+
+; BFMLAL (multi, vgx4), reached through the fmlal.vg2x4 intrinsic with bf16
+; operands: the calls at %slice and %slice + 6 are expected to select ZA slice
+; offsets 0:1 and 6:7. The parameter list resumes on the line that follows the
+; autogenerated assertions.
+define void @multi_vector_add_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+; CHECK-LABEL: multi_vector_add_multi_vg2x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) {
+ call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
+ ret void
+}
+
+; FMLAL (multi, vgx4) with f16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7. The parameter list resumes
+; on the line that follows the autogenerated assertions.
+define void @multi_vector_add_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+; CHECK-LABEL: multi_vector_add_multi_vg2x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) {
+ call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
+ ret void
+}
+
+; SMLAL (multi, vgx4) with i16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7. The parameter list resumes
+; on the line that follows the autogenerated assertions.
+define void @multi_vector_add_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+; CHECK-LABEL: multi_vector_add_multi_vg2x4_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+ call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ ret void
+}
+
+; UMLAL (multi, vgx4) with i16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7. The parameter list resumes
+; on the line that follows the autogenerated assertions.
+define void @multi_vector_add_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+; CHECK-LABEL: multi_vector_add_multi_vg2x4_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+ call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x4 (MULTI)
+;
+
+; BFMLSL (multi, vgx4), reached through the fmlsl.vg2x4 intrinsic with bf16
+; operands: the calls at %slice and %slice + 6 are expected to select ZA slice
+; offsets 0:1 and 6:7. The parameter list resumes on the line that follows the
+; autogenerated assertions.
+define void @multi_vector_sub_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+; CHECK-LABEL: multi_vector_sub_multi_vg2x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) {
+ call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
+ ret void
+}
+
+; FMLSL (multi, vgx4) with f16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7. The parameter list resumes
+; on the line that follows the autogenerated assertions.
+define void @multi_vector_sub_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+; CHECK-LABEL: multi_vector_sub_multi_vg2x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) {
+ call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
+ ret void
+}
+
+; SMLSL (multi, vgx4) with i16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7. The parameter list resumes
+; on the line that follows the autogenerated assertions.
+define void @multi_vector_sub_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+; CHECK-LABEL: multi_vector_sub_multi_vg2x4_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+ call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ ret void
+}
+
+; UMLSL (multi, vgx4) with i16 inputs: the calls at %slice and %slice + 6 are
+; expected to select ZA slice offsets 0:1 and 6:7. The parameter list resumes
+; on the line that follows the autogenerated assertions.
+define void @multi_vector_sub_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+; CHECK-LABEL: multi_vector_sub_multi_vg2x4_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT: ret
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
+ call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
+ ret void
+}
+
+;
+; BF/F/S/UMLAL x1 (INDEXED)
+;
+
+; FMLAL (indexed, single vector) with f16 inputs: offset 0 / lane 0 and
+; offset 14 / lane 7 are expected to select slices 0:1 and 14:15.
+define void @multi_vector_add_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: fmlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: fmlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
+ ret void
+}
+
+; BFMLAL (indexed, single vector), reached through the fmlal.lane.vg2x1
+; intrinsic with bf16 operands: offset 0 / lane 0 and offset 14 / lane 7 are
+; expected to select slices 0:1 and 14:15.
+define void @multi_vector_add_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: bfmlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: bfmlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+ ret void
+}
+
+; SMLAL (indexed, single vector) with i16 inputs: offset 0 / lane 0 and
+; offset 14 / lane 7 are expected to select slices 0:1 and 14:15.
+define void @multi_vector_add_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: smlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: smlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+; UMLAL (indexed, single vector) with i16 inputs: offset 0 / lane 0 and
+; offset 14 / lane 7 are expected to select slices 0:1 and 14:15.
+define void @multi_vector_add_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x1_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: umlal za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: umlal za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x1 (INDEXED)
+;
+
+; FMLSL (indexed, single vector) with f16 inputs: offset 0 / lane 0 and
+; offset 14 / lane 7 are expected to select slices 0:1 and 14:15.
+define void @multi_vector_sub_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: fmlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: fmlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
+ ret void
+}
+
+; BFMLSL (indexed, single vector), reached through the fmlsl.lane.vg2x1
+; intrinsic with bf16 operands: offset 0 / lane 0 and offset 14 / lane 7 are
+; expected to select slices 0:1 and 14:15.
+define void @multi_vector_sub_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: bfmlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+ ret void
+}
+
+; SMLSL (indexed, single vector) with i16 inputs: offset 0 / lane 0 and
+; offset 14 / lane 7 are expected to select slices 0:1 and 14:15.
+define void @multi_vector_sub_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: smlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: smlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+; UMLSL (indexed, single vector) with i16 inputs: offset 0 / lane 0 and
+; offset 14 / lane 7 are expected to select slices 0:1 and 14:15.
+define void @multi_vector_sub_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x1_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: umlsl za.s[w8, 0:1], z0.h, z1.h[0]
+; CHECK-NEXT: umlsl za.s[w8, 14:15], z0.h, z1.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
+ %slice.14 = add i32 %slice, 14
+ call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+;
+; BF/F/S/UMLAL x2 (INDEXED)
+;
+
+; FMLAL (indexed, vgx2) with f16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
+ ret void
+}
+
+; BFMLAL (indexed, vgx2), reached through the fmlal.lane.vg2x2 intrinsic with
+; bf16 operands: offset 0 / lane 0 and offset 6 / lane 7 are expected to
+; select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
+ ret void
+}
+
+; SMLAL (indexed, vgx2) with i16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+; UMLAL (indexed, vgx2) with i16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x2_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x2 (INDEXED)
+;
+
+; FMLSL (indexed, vgx2) with f16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_sub_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
+ ret void
+}
+
+; BFMLSL (indexed, vgx2), reached through the fmlsl.lane.vg2x2 intrinsic with
+; bf16 operands: offset 0 / lane 0 and offset 6 / lane 7 are expected to
+; select slices 0:1 and 6:7.
+define void @multi_vector_sub_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
+ ret void
+}
+
+; SMLSL (indexed, vgx2) with i16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_sub_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+; UMLSL (indexed, vgx2) with i16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_sub_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x2_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
+; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+;
+; BF/F/S/UMLAL x4 (INDEXED)
+;
+
+; FMLAL (indexed, vgx4) with f16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm, i32 7)
+ ret void
+}
+
+; BFMLAL (indexed, vgx4), reached through the fmlal.lane.vg2x4 intrinsic with
+; bf16 operands: offset 0 / lane 0 and offset 6 / lane 7 are expected to
+; select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm, i32 7)
+ ret void
+}
+
+; SMLAL (indexed, vgx4) with i16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+; UMLAL (indexed, vgx4) with i16 inputs: offset 0 / lane 0 and offset 6 /
+; lane 7 are expected to select slices 0:1 and 6:7.
+define void @multi_vector_add_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg2x4_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 0)
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+;
+; BF/F/S/UMLSL x4 (INDEXED)
+;
+
+; Indexed FMLSL on a four-vector group (vg2x4) with half inputs. The fmlsl lane
+; intrinsic (nxv8f16 overload) is called at %slice with lane 0 and at
+; %slice + 6 with lane 7. The expected assembly has a single `mov w8, w0`,
+; with the +6 encoded as the `6:7` slice immediate rather than a separate add.
+define void @multi_vector_sub_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm, i32 0)
+ ; Second call uses a non-zero slice offset (6) and lane index 7.
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice.6,
+ <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
+ <vscale x 8 x half> %zm, i32 7)
+ ret void
+}
+
+; Indexed BFMLSL on a four-vector group (vg2x4) with bfloat inputs. Note that
+; the IR uses the fmlsl lane intrinsic with the nxv8bf16 overload, and the
+; expected instruction is `bfmlsl`. It is called at %slice with lane 0 and at
+; %slice + 6 with lane 7. The expected assembly has a single `mov w8, w0`,
+; with the +6 encoded as the `6:7` slice immediate rather than a separate add.
+define void @multi_vector_sub_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm, i32 0)
+ ; Second call uses a non-zero slice offset (6) and lane index 7.
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice.6,
+ <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
+ <vscale x 8 x bfloat> %zm, i32 7)
+ ret void
+}
+
+; Indexed SMLSL on a four-vector group (vg2x4) with i16 inputs. The smlsl lane
+; intrinsic is called at %slice with lane 0 and at %slice + 6 with lane 7. The
+; expected assembly has a single `mov w8, w0`, with the +6 encoded as the
+; `6:7` slice immediate rather than a separate add.
+define void @multi_vector_sub_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 0)
+ ; Second call uses a non-zero slice offset (6) and lane index 7.
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+; Indexed UMLSL on a four-vector group (vg2x4) with i16 inputs. The umlsl lane
+; intrinsic is called at %slice with lane 0 and at %slice + 6 with lane 7. The
+; expected assembly has a single `mov w8, w0`, with the +6 encoded as the
+; `6:7` slice immediate rather than a separate add.
+define void @multi_vector_sub_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg2x4_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT: umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 0)
+ ; Second call uses a non-zero slice offset (6) and lane index 7.
+ %slice.6 = add i32 %slice, 6
+ call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
+ <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
+ <vscale x 8 x i16> %zm, i32 7)
+ ret void
+}
+
+; Declarations for the "single" intrinsic forms. Each takes a leading i32
+; followed by vectors of one element type: 2 vectors for vg2x1, 3 for vg2x2
+; and 5 for vg2x4. The bf16 overloads share the fmlal/fmlsl intrinsic names.
+declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+; Declarations for the multi-vector intrinsic forms (no "single"/"lane" in the
+; name). Each takes a leading i32 followed by vectors of one element type:
+; 4 vectors for vg2x2 and 8 for vg2x4.
+declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+; Declarations for the indexed ("lane") intrinsic forms. Each takes a leading
+; i32, then vectors of one element type (2 for vg2x1, 3 for vg2x2, 5 for
+; vg2x4), then a trailing i32. In the vg2x4 tests in this file the leading i32
+; is the slice index and the trailing i32 is the lane index (0 and 7).
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
More information about the llvm-commits
mailing list