[llvm] cfd3a0e - [AArch64][SME2] Add multi-vector fused multiply-add/subtract intrinsics
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 20 03:14:52 PST 2023
Author: Kerry McLaughlin
Date: 2023-01-20T11:14:41Z
New Revision: cfd3a0e04ad60c72dda79d407a78f4fff0bdad6d
URL: https://github.com/llvm/llvm-project/commit/cfd3a0e04ad60c72dda79d407a78f4fff0bdad6d
DIFF: https://github.com/llvm/llvm-project/commit/cfd3a0e04ad60c72dda79d407a78f4fff0bdad6d.diff
LOG: [AArch64][SME2] Add multi-vector fused multiply-add/subtract intrinsics
Adds intrinsics for the following:
- fmla (single, multi & indexed)
- fmls (single, multi & indexed)
NOTE: These intrinsics are still in development and are subject
to future changes.
Reviewed By: CarolineConcatto
Differential Revision: https://reviews.llvm.org/D141946
Added:
llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
llvm/lib/Target/AArch64/SMEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 172d844598528..5a28a6eb03702 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2692,4 +2692,69 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
}
+
+ //
+ // SME2 Intrinsics
+ //
+
+ class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ []>;
+
+ class SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ []>;
+
+ class SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ []>;
+
+ class SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ []>;
+
+ class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty],
+ [ImmArg<ArgIndex<4>>]>;
+
+ class SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty],
+ [ImmArg<ArgIndex<6>>]>;
+
+ //
+ // Multi-vector fused multiply-add/subtract
+ //
+
+ def int_aarch64_sme_fmla_single_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sme_fmls_single_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sme_fmla_single_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sme_fmls_single_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+ def int_aarch64_sme_fmla_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_fmls_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_fmla_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_fmls_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+
+ def int_aarch64_sme_fmla_lane_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
+ def int_aarch64_sme_fmls_lane_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
+ def int_aarch64_sme_fmla_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+ def int_aarch64_sme_fmls_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eaf467d7831a1..55d6da056a49e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2698,13 +2698,19 @@ AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
MachineBasicBlock *
AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
- MachineBasicBlock *BB) const {
+ MachineBasicBlock *BB, bool HasTile) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+ unsigned StartIdx = 0;
- MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
- MIB.addReg(BaseReg + MI.getOperand(0).getImm());
- for (unsigned I = 1; I < MI.getNumOperands(); ++I)
+ if (HasTile) {
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+ StartIdx = 1;
+ } else
+ MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
+
+ for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
MIB.add(MI.getOperand(I));
MI.eraseFromParent(); // The pseudo is gone now.
@@ -2737,16 +2743,18 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
uint64_t SMEMatrixType =
TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
switch (SMEMatrixType) {
+ case (AArch64::SMEMatrixArray):
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
case (AArch64::SMEMatrixTileB):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
case (AArch64::SMEMatrixTileH):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
case (AArch64::SMEMatrixTileS):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
case (AArch64::SMEMatrixTileD):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
case (AArch64::SMEMatrixTileQ):
- return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
+ return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3731c5ae2408a..99965ce1902e1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -596,7 +596,8 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
- MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineInstr &MI, MachineBasicBlock *BB,
+ bool HasTile) const;
MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index df08e5ec758c3..e8a186057ff15 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -249,30 +249,30 @@ def : Pat<(i64 (AArch64ObscureCopy (i64 GPR64:$idx))),
let Predicates = [HasSME2] in {
defm ADD_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b0011010, MatrixOp32, ZZ_s, ZPR4b32>;
defm ADD_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b0111010, MatrixOp32, ZZZZ_s, ZPR4b32>;
-defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b011010, MatrixOp32, ZZ_s_mul_r>;
-defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b011010, MatrixOp32, ZZZZ_s_mul_r>;
+defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b011010, MatrixOp32, ZZ_s_mul_r, nxv4i32, null_frag>;
+defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b011010, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, null_frag>;
defm ADD_VG2_2ZZ : sme2_int_sve_destructive_vector_vg2_single<"add", 0b0110000>;
defm ADD_VG4_4ZZ : sme2_int_sve_destructive_vector_vg4_single<"add", 0b0110000>;
defm SUB_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b0011011, MatrixOp32, ZZ_s, ZPR4b32>;
defm SUB_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b0111011, MatrixOp32, ZZZZ_s, ZPR4b32>;
-defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b011011, MatrixOp32, ZZ_s_mul_r>;
-defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b011011, MatrixOp32, ZZZZ_s_mul_r>;
-
-defm FMLA_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011000, MatrixOp32, ZZ_s, ZPR4b32>;
-defm FMLA_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111000, MatrixOp32, ZZZZ_s, ZPR4b32>;
-defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b011000, MatrixOp32, ZZ_s_mul_r>;
-defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b011000, MatrixOp32, ZZZZ_s_mul_r>;
-defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b0000, ZZ_s_mul_r, ZPR4b32>;
-defm FMLA_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmla", 0b0000, ZZZZ_s_mul_r, ZPR4b32>;
-
-defm FMLS_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011001, MatrixOp32, ZZ_s, ZPR4b32>;
-defm FMLS_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111001, MatrixOp32, ZZZZ_s, ZPR4b32>;
-defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b011001, MatrixOp32, ZZ_s_mul_r>;
-defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b011001, MatrixOp32, ZZZZ_s_mul_r>;
-defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b0010, ZZ_s_mul_r, ZPR4b32>;
-defm FMLS_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmls", 0b0010, ZZZZ_s_mul_r, ZPR4b32>;
+defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b011011, MatrixOp32, ZZ_s_mul_r, nxv4i32, null_frag>;
+defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b011011, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, null_frag>;
+
+defm FMLA_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011000, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x2>;
+defm FMLA_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111000, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x4>;
+defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b011000, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x2>;
+defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b011000, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x4>;
+defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b0000, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x2>;
+defm FMLA_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmla", 0b0000, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x4>;
+
+defm FMLS_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011001, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x2>;
+defm FMLS_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111001, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x4>;
+defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b011001, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x2>;
+defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b011001, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x4>;
+defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b0010, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x2>;
+defm FMLS_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmls", 0b0010, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x4>;
defm ADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"add", 0b0010, MatrixOp32, ZZ_s_mul_r>;
defm ADD_VG4_M4Z_S : sme2_multivec_accum_add_sub_vg4<"add", 0b0010, MatrixOp32, ZZZZ_s_mul_r>;
@@ -446,71 +446,71 @@ defm SCLAMP_VG4_4Z4Z : sme2_int_clamp_vector_vg4_multi<"sclamp", 0b0>;
defm UCLAMP_VG2_2Z2Z : sme2_int_clamp_vector_vg2_multi<"uclamp", 0b1>;
defm UCLAMP_VG4_4Z4Z : sme2_int_clamp_vector_vg4_multi<"uclamp", 0b1>;
-defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b1001, ZZ_h_mul_r, ZPR4b16>;
-defm FDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b1001, ZZZZ_h_mul_r, ZPR4b16>;
+defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b1001, ZZ_h_mul_r, ZPR4b16, nxv8f16, null_frag>;
+defm FDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b1001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, null_frag>;
defm FDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010000, MatrixOp32, ZZ_h, ZPR4b16>;
defm FDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110000, MatrixOp32, ZZZZ_h, ZPR4b16>;
-defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b010000, MatrixOp32, ZZ_h_mul_r>;
-defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b010000, MatrixOp32, ZZZZ_h_mul_r>;
+defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b010000, MatrixOp32, ZZ_h_mul_r, nxv8f16, null_frag>;
+defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b010000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, null_frag>;
-defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b1011, ZZ_h_mul_r, ZPR4b16>;
-defm BFDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"bfdot", 0b1011, ZZZZ_h_mul_r, ZPR4b16>;
+defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b1011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, null_frag>;
+defm BFDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"bfdot", 0b1011, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, null_frag>;
defm BFDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"bfdot", 0b0010010, MatrixOp32, ZZ_h, ZPR4b16>;
defm BFDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"bfdot", 0b0110010, MatrixOp32, ZZZZ_h, ZPR4b16>;
-defm BFDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b010010, MatrixOp32, ZZ_h_mul_r>;
-defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b010010, MatrixOp32, ZZZZ_h_mul_r>;
+defm BFDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b010010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b010010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
-defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b0011, ZZ_h_mul_r, ZPR4b16>;
+defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b0011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, null_frag>;
-defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b0001, ZZ_h_mul_r, ZPR4b16>;
+defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b0001, ZZ_h_mul_r, ZPR4b16, nxv8f16, null_frag>;
-defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1000, ZZ_h_mul_r, ZPR4b16>;
-defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1100, ZZ_b_mul_r, ZPR4b8>;
-defm SDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1000, ZZZZ_h_mul_r, ZPR4b16>;
-defm SDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1100, ZZZZ_b_mul_r, ZPR4b8>;
+defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1000, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
+defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1100, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
+defm SDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1000, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
+defm SDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
defm SDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1010101, MatrixOp32, ZZ_h, ZPR4b16>;
defm SDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1110101, MatrixOp32, ZZZZ_h, ZPR4b16>;
-defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110101, MatrixOp32, ZZ_h_mul_r>;
-defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110101, MatrixOp32, ZZZZ_h_mul_r>;
+defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110101, MatrixOp32, ZZ_h_mul_r, nxv8i16, null_frag>;
+defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110101, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, null_frag>;
defm SDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b0010100, MatrixOp32, ZZ_b, ZPR4b8>;
defm SDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b0110100, MatrixOp32, ZZZZ_b, ZPR4b8>;
-defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b010100, MatrixOp32, ZZ_b_mul_r>;
-defm SDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b010100, MatrixOp32, ZZZZ_b_mul_r>;
+defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b010100, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm SDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b010100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
-defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b1111, ZZ_b_mul_r, ZPR4b8>;
-defm SUDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sudot", 0b1111, ZZZZ_b_mul_r, ZPR4b8>;
+defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b1111, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
+defm SUDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sudot", 0b1111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
defm SUDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"sudot", 0b0010111, MatrixOp32, ZZ_b, ZPR4b8>;
defm SUDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"sudot", 0b0110111, MatrixOp32, ZZZZ_b, ZPR4b8>;
-defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b0100, ZZ_h_mul_r, ZPR4b16>;
-defm SVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"svdot", 0b0100, ZZZZ_b_mul_r, ZPR4b8>;
+defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b0100, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
+defm SVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"svdot", 0b0100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
-defm SUVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"suvdot", 0b0111, ZZZZ_b_mul_r, ZPR4b8>;
+defm SUVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"suvdot", 0b0111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
-defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1010, ZZ_h_mul_r, ZPR4b16>;
-defm UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1110, ZZ_b_mul_r, ZPR4b8>;
-defm UDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1110, ZZZZ_b_mul_r, ZPR4b8>;
-defm UDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1010, ZZZZ_h_mul_r, ZPR4b16>;
+defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1010, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
+defm UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1110, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
+defm UDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
+defm UDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1010, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
defm UDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1010111, MatrixOp32, ZZ_h, ZPR4b16>;
defm UDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1110111, MatrixOp32, ZZZZ_h, ZPR4b16>;
-defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110111, MatrixOp32, ZZ_h_mul_r>;
-defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110111, MatrixOp32, ZZZZ_h_mul_r>;
+defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110111, MatrixOp32, ZZ_h_mul_r, nxv8i16, null_frag>;
+defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110111, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, null_frag>;
defm UDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b0010110, MatrixOp32, ZZ_b, ZPR4b8>;
defm UDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b0110110, MatrixOp32, ZZZZ_b, ZPR4b8>;
-defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b010110, MatrixOp32, ZZ_b_mul_r>;
-defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b010110, MatrixOp32, ZZZZ_b_mul_r>;
+defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b010110, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b010110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
-defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b1101, ZZ_b_mul_r, ZPR4b8>;
-defm USDOT_VG4_M4ZZI_BToS: sme2_multi_vec_array_vg4_index_32b<"usdot", 0b1101, ZZZZ_b_mul_r, ZPR4b8>;
+defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b1101, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
+defm USDOT_VG4_M4ZZI_BToS: sme2_multi_vec_array_vg4_index_32b<"usdot", 0b1101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
defm USDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"usdot", 0b0010101, MatrixOp32, ZZ_b, ZPR4b8>;
defm USDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg24_single<"usdot", 0b0110101, MatrixOp32, ZZZZ_b, ZPR4b8>;
-defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b010101, MatrixOp32, ZZ_b_mul_r>;
-defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b010101, MatrixOp32, ZZZZ_b_mul_r>;
+defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b010101, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b010101, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
-defm USVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"usvdot", 0b0101, ZZZZ_b_mul_r, ZPR4b8>;
+defm USVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"usvdot", 0b0101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
-defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b0110, ZZ_h_mul_r, ZPR4b16>;
-defm UVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"uvdot", 0b0110, ZZZZ_b_mul_r, ZPR4b8>;
+defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b0110, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
+defm UVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"uvdot", 0b0110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
def SMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlall", 0b000>;
defm SMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlall", 0b000>;
@@ -707,13 +707,13 @@ defm STNT1D_VG4_M4ZPXI : sme2_st_vector_vg4_multi_scalar_immediate<0b11, 0b1, ZZ
let Predicates = [HasSME2, HasSMEI16I64] in {
defm ADD_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b1011010, MatrixOp64, ZZ_d, ZPR4b64>;
defm ADD_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"add", 0b1111010, MatrixOp64, ZZZZ_d, ZPR4b64>;
-defm ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b111010, MatrixOp64, ZZ_d_mul_r>;
-defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b111010, MatrixOp64, ZZZZ_d_mul_r>;
+defm ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b111010, MatrixOp64, ZZ_d_mul_r, nxv2i64, null_frag>;
+defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b111010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, null_frag>;
defm SUB_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b1011011, MatrixOp64, ZZ_d, ZPR4b64>;
defm SUB_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"sub", 0b1111011, MatrixOp64, ZZZZ_d, ZPR4b64>;
-defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b111011, MatrixOp64, ZZ_d_mul_r>;
-defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b111011, MatrixOp64, ZZZZ_d_mul_r>;
+defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b111011, MatrixOp64, ZZ_d_mul_r, nxv2i64, null_frag>;
+defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b111011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, null_frag>;
defm ADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"add", 0b1010, MatrixOp64, ZZ_d_mul_r>;
defm ADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"add", 0b1010, MatrixOp64, ZZZZ_d_mul_r>;
@@ -721,23 +721,23 @@ defm ADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"add", 0b1010, MatrixOp64,
defm SUB_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"sub", 0b1011, MatrixOp64, ZZ_d_mul_r>;
defm SUB_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"sub", 0b1011, MatrixOp64, ZZZZ_d_mul_r>;
-defm SDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"sdot", 0b01, ZZ_h_mul_r, ZPR4b16>;
-defm SDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"sdot", 0b001, ZZZZ_h_mul_r, ZPR4b16>;
+defm SDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"sdot", 0b01, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
+defm SDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"sdot", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
defm SDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1010100, MatrixOp64, ZZ_h, ZPR4b16>;
defm SDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"sdot", 0b1110100, MatrixOp64, ZZZZ_h, ZPR4b16>;
-defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110100, MatrixOp64, ZZ_h_mul_r>;
-defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110100, MatrixOp64, ZZZZ_h_mul_r>;
+defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110100, MatrixOp64, ZZ_h_mul_r, nxv8i16, null_frag>;
+defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, null_frag>;
-defm SVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"svdot", 0b101, ZZZZ_h_mul_r, ZPR4b16>;
+defm SVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"svdot", 0b101, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
-defm UDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"udot", 0b11, ZZ_h_mul_r, ZPR4b16>;
-defm UDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"udot", 0b011, ZZZZ_h_mul_r, ZPR4b16>;
+defm UDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"udot", 0b11, ZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
+defm UDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"udot", 0b011, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
defm UDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1010110, MatrixOp64, ZZ_h, ZPR4b16>;
defm UDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg24_single<"udot", 0b1110110, MatrixOp64, ZZZZ_h, ZPR4b16>;
-defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110110, MatrixOp64, ZZ_h_mul_r>;
-defm UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110110, MatrixOp64, ZZZZ_h_mul_r>;
+defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110110, MatrixOp64, ZZ_h_mul_r, nxv8i16, null_frag>;
+defm UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, null_frag>;
-defm UVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"uvdot", 0b111, ZZZZ_h_mul_r, ZPR4b16>;
+defm UVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"uvdot", 0b111, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, null_frag>;
def SMLALL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"smlall", 0b00>;
defm SMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlall", 0b00>;
@@ -777,19 +777,19 @@ defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll", 0b1110, Matr
}
let Predicates = [HasSME2, HasSMEF64F64] in {
-defm FMLA_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmla", 0b00, ZZ_d_mul_r, ZPR4b64>;
-defm FMLA_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmla", 0b000, ZZZZ_d_mul_r, ZPR4b64>;
-defm FMLA_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b1011000, MatrixOp64, ZZ_d, ZPR4b64>;
-defm FMLA_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b1111000, MatrixOp64, ZZZZ_d, ZPR4b64>;
-defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b111000, MatrixOp64, ZZ_d_mul_r>;
-defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b111000, MatrixOp64, ZZZZ_d_mul_r>;
-
-defm FMLS_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmls", 0b10, ZZ_d_mul_r, ZPR4b64>;
-defm FMLS_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmls", 0b010, ZZZZ_d_mul_r, ZPR4b64>;
-defm FMLS_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b1011001, MatrixOp64, ZZ_d, ZPR4b64>;
-defm FMLS_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b1111001, MatrixOp64, ZZZZ_d, ZPR4b64>;
-defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b111001, MatrixOp64, ZZ_d_mul_r>;
-defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b111001, MatrixOp64, ZZZZ_d_mul_r>;
+defm FMLA_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmla", 0b00, ZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_lane_vg1x2>;
+defm FMLA_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmla", 0b000, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_lane_vg1x4>;
+defm FMLA_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b1011000, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x2>;
+defm FMLA_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b1111000, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x4>;
+defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b111000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x2>;
+defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b111000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x4>;
+
+defm FMLS_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmls", 0b10, ZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x2>;
+defm FMLS_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmls", 0b010, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x4>;
+defm FMLS_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b1011001, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x2>;
+defm FMLS_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b1111001, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x4>;
+defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b111001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x2>;
+defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b111001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x4>;
defm FADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fadd", 0b1000, MatrixOp64, ZZ_d_mul_r>;
defm FADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fadd", 0b1000, MatrixOp64, ZZZZ_d_mul_r>;
@@ -824,15 +824,15 @@ defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00>;
defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b00>;
defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>;
defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b010001, MatrixOp16, ZZ_h_mul_r>;
-defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b010001, MatrixOp16, ZZZZ_h_mul_r>;
+defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b010001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
+defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b010001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b01>;
defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b01>;
defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>;
defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b010011, MatrixOp16, ZZ_h_mul_r>;
-defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b010011, MatrixOp16, ZZZZ_h_mul_r>;
+defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b010011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
+defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b010011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>;
defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>;
@@ -851,15 +851,15 @@ defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b10>;
defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b10>;
defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>;
defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b110001, MatrixOp16, ZZ_h_mul_r>;
-defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b110001, MatrixOp16, ZZZZ_h_mul_r>;
+defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b110001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b110001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b11>;
defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b11>;
defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>;
defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>;
-defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b110011, MatrixOp16, ZZ_h_mul_r>;
-defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b110011, MatrixOp16, ZZZZ_h_mul_r>;
+defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b110011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;
+defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b110011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;
defm BFMAX_VG2_2ZZ : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 3556d7fcfbe0b..27f7c5003259d 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -50,6 +50,74 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
let usesCustomInserter = 1;
}
+class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
+ ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), []> {
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
+class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
+ SMEMatrixTypeEnum za_flag>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), []> {
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
+class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
+ ZPRRegOp zpr_ty, Operand imm_ty, SMEMatrixTypeEnum za_flag>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, imm_ty:$i), []> {
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// SME pattern match helpers.
+//===----------------------------------------------------------------------===//
+
+class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
+ ValueType vt, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+ zpr_ty:$Zm)>;
+class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
+ ValueType vt, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ zpr_ty:$Zm)>;
+
+class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm1, vt:$Zm2),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>;
+
+class SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm1, vt:$Zm2, vt:$Zm3, vt:$Zm4),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zm1, zsub0, vt:$Zm2, zsub1, vt:$Zm3, zsub2, vt:$Zm4, zsub3))>;
+
+class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
+ Operand imm_ty, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
+
+class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
+ Operand imm_ty, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ zpr_ty:$Zm, imm_ty:$i)>;
+
//===----------------------------------------------------------------------===//
// SME Outer Products
//===----------------------------------------------------------------------===//
@@ -1253,10 +1321,38 @@ multiclass sme2_dot_mla_add_sub_array_vg24_single<string mnemonic, bits<7> op,
MatrixOperand matrix_ty,
RegisterOperand multi_vector_ty,
ZPRRegOp zpr_ty>{
- def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>;
+ def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+ def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
+ (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), 0>;
+}
+
+multiclass sme2_dot_mla_add_sub_array_vg2_single<string mnemonic, bits<7> op,
+ MatrixOperand matrix_ty,
+ RegisterOperand multi_vector_ty,
+ ZPRRegOp zpr_ty, ValueType vty, SDPatternOperator intrinsic>{
+ def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+ def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
+ (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), 0>;
+
+ def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, zpr_ty, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, zpr_ty, vty, tileslice16>;
+}
+
+multiclass sme2_dot_mla_add_sub_array_vg4_single<string mnemonic, bits<7> op,
+ MatrixOperand matrix_ty,
+ RegisterOperand multi_vector_ty,
+ ZPRRegOp zpr_ty, ValueType vty, SDPatternOperator intrinsic>{
+ def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_ty, multi_vector_ty, zpr_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
(!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), 0>;
+
+ def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, zpr_ty, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, zpr_ty, vty, tileslice16>;
}
//===----------------------------------------------------------------------===//
@@ -1290,12 +1386,16 @@ class sme2_dot_mla_add_sub_array_vg2_multi<bits<6> op,
multiclass sme2_dot_mla_add_sub_array_vg2_multi<string mnemonic, bits<6> op,
MatrixOperand matrix_ty,
- RegisterOperand multi_vector_ty>{
- def NAME : sme2_dot_mla_add_sub_array_vg2_multi<op, matrix_ty, multi_vector_ty, mnemonic>;
+ RegisterOperand multi_vector_ty, ValueType zpr_ty,
+ SDPatternOperator intrinsic> {
+ def NAME : sme2_dot_mla_add_sub_array_vg2_multi<op, matrix_ty, multi_vector_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
- def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
- (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>;
+ def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, zpr_ty, tileslice16>;
+ def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
+ (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>;
}
class sme2_dot_mla_add_sub_array_vg4_multi<bits<6> op,
@@ -1325,15 +1425,18 @@ class sme2_dot_mla_add_sub_array_vg4_multi<bits<6> op,
let Constraints = "$ZAd = $_ZAd";
}
-
multiclass sme2_dot_mla_add_sub_array_vg4_multi<string mnemonic, bits<6> op,
MatrixOperand matrix_ty,
- RegisterOperand multi_vector_ty>{
- def NAME : sme2_dot_mla_add_sub_array_vg4_multi<op, matrix_ty, multi_vector_ty, mnemonic>;
+ RegisterOperand multi_vector_ty,
+ ValueType zpr_ty, SDPatternOperator intrinsic>{
+ def NAME : sme2_dot_mla_add_sub_array_vg4_multi<op, matrix_ty, multi_vector_ty, mnemonic>, SMEPseudo2Instr<NAME, 1>;
- def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
- (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>;
+ def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, zpr_ty, tileslice16>;
+ def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
+ (!cast<Instruction>(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>;
}
//===----------------------------------------------------------------------===//
@@ -2108,15 +2211,21 @@ class sme2_multi_vec_array_vg2_index<bit sz, bits<6> op, MatrixOperand matrix_ty
// SME2 multi-vec ternary indexed two registers 32-bit
multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<4> op,
RegisterOperand multi_vector_ty,
- ZPRRegOp vector_ty> {
+ ZPRRegOp vector_ty, ValueType vt,
+ SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg2_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty, vector_ty,
- VectorIndexS, mnemonic> {
+ VectorIndexS32b_timm, mnemonic>, SMEPseudo2Instr<NAME, 1> {
bits<2> i;
let Inst{11-10} = i;
}
+
+ def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexS32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
+
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
- multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS:$i), 0>;
+ multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>;
}
// SME2.1 multi-vec ternary indexed two registers 16-bit
@@ -2141,7 +2250,7 @@ class sme2_multi_vec_array_vg2_index_64b<bits<2> op,
string mnemonic>
: I<(outs MatrixOp64:$ZAda),
(ins MatrixOp64:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
- multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1),
+ multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1),
mnemonic, "\t$ZAda[$Rv, $imm3, vgx2], $Zn, $Zm$i1",
"", []>, Sched<[]> {
bits<4> Zm;
@@ -2165,13 +2274,18 @@ class sme2_multi_vec_array_vg2_index_64b<bits<2> op,
multiclass sme2_multi_vec_array_vg2_index_64b<string mnemonic, bits<2> op,
RegisterOperand multi_vector_ty,
- ZPRRegOp vector_ty> {
+ ZPRRegOp vector_ty, ValueType vt,
+ SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg2_index_64b<op, multi_vector_ty, vector_ty,
- mnemonic>;
+ mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+ def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexD32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexD32b_timm, tileslice16>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i1",
(!cast<Instruction>(NAME) MatrixOp64:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
- multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1), 0>;
+ multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), 0>;
}
class sme2_multi_vec_array_vg4_index<bit sz, bits<6> op, MatrixOperand matrix_ty,
@@ -2205,16 +2319,21 @@ class sme2_multi_vec_array_vg4_index<bit sz, bits<6> op, MatrixOperand matrix_ty
// SME2 multi-vec ternary indexed four registers 32-bit
multiclass sme2_multi_vec_array_vg4_index_32b<string mnemonic, bits<4> op,
RegisterOperand multi_vector_ty,
- ZPRRegOp vector_ty> {
+ ZPRRegOp vector_ty, ValueType vt,
+ SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg4_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty,
- vector_ty, VectorIndexS, mnemonic>{
+ vector_ty, VectorIndexS32b_timm, mnemonic>, SMEPseudo2Instr<NAME, 1> {
bits<2> i;
let Inst{11-10} = i;
}
+ def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexS32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
+
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
- multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS:$i), 0>;
+ multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>;
}
// SME2.1 multi-vec ternary indexed four registers 16-bit
@@ -2239,7 +2358,7 @@ class sme2_multi_vec_array_vg4_index_64b<bits<3> op,
string mnemonic>
: I<(outs MatrixOp64:$ZAda),
(ins MatrixOp64:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
- multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1),
+ multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1),
mnemonic, "\t$ZAda[$Rv, $imm3, vgx4], $Zn, $Zm$i1",
"", []>, Sched<[]> {
bits<4> Zm;
@@ -2264,13 +2383,18 @@ class sme2_multi_vec_array_vg4_index_64b<bits<3> op,
multiclass sme2_multi_vec_array_vg4_index_64b<string mnemonic, bits<3> op,
RegisterOperand multi_vector_ty,
- ZPRRegOp vector_ty> {
+ ZPRRegOp vector_ty, ValueType vty,
+ SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg4_index_64b<op, multi_vector_ty, vector_ty,
- mnemonic>;
+ mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+ def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexD32b_timm, SMEMatrixArray>;
+
+ def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vty, VectorIndexD32b_timm, tileslice16>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i1",
(!cast<Instruction>(NAME) MatrixOp64:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
- multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD:$i1), 0>;
+ multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), 0>;
}
//===----------------------------------------------------------------------===//
// SME2 multi-vec indexed long long MLA one source 32-bit
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll
new file mode 100644
index 0000000000000..016fb3c50cd9e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll
@@ -0,0 +1,658 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-f64f64 -verify-machineinstrs | FileCheck %s
+
+; FMLA (SINGLE)
+
+define void @multi_vector_add_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
+; CHECK-NEXT: fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
+; CHECK-LABEL: multi_vector_add_single_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
+; CHECK-NEXT: fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_add_single_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
+; CHECK-NEXT: fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm) {
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @multi_vector_add_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_add_single_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
+; CHECK-NEXT: fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm) {
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm)
+ ret void
+}
+
+; FMLS (SINGLE)
+
+define void @multi_vector_sub_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
+; CHECK-NEXT: fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
+; CHECK-LABEL: multi_vector_sub_single_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
+; CHECK-NEXT: fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_sub_single_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
+; CHECK-NEXT: fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm) {
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm)
+ ret void
+}
+
+define void @multi_vector_sub_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_sub_single_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
+; CHECK-NEXT: fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm) {
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm)
+ ret void
+}
+
+; FMLA (MULTI)
+
+define void @multi_vector_add_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+; CHECK-LABEL: multi_vector_add_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT: fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ ret void
+}
+
+define void @multi_vector_add_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+; CHECK-LABEL: multi_vector_add_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
+; CHECK-NEXT: fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ ret void
+}
+
+; Test to ensure the correct register class is used (first register in the list should be a multiple of 2)
+define void @multi_vector_add_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+; CHECK-LABEL: multi_vector_add_vg1x2_s_regclass:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov z4.d, z3.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z6.s, z7.s }, { z4.s, z5.s }
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1) {
+ call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm0)
+ ret void
+}
+
+define void @multi_vector_add_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_add_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT: fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
+ ret void
+}
+
+define void @multi_vector_add_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_add_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
+; CHECK-NEXT: fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
+ ret void
+}
+
+; Test to ensure the correct register class is used (first register in the list should be a multiple of 4)
+define void @multi_vector_add_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_add_vg1x4_s_regclass:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z26.d, z7.d
+; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov z25.d, z6.d
+; CHECK-NEXT: mov z29.d, z2.d
+; CHECK-NEXT: mov z24.d, z5.d
+; CHECK-NEXT: mov z28.d, z1.d
+; CHECK-NEXT: mov z27.d, z4.d
+; CHECK-NEXT: mov z31.d, z0.d
+; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z28.s - z31.s }, { z24.s - z27.s }
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3) {
+ call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm0)
+ ret void
+}
+
+; FMLS (MULTI)
+
+define void @multi_vector_sub_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+; CHECK-LABEL: multi_vector_sub_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT: fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
+ ret void
+}
+
+define void @multi_vector_sub_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+; CHECK-LABEL: multi_vector_sub_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
+; CHECK-NEXT: fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
+ ret void
+}
+
+define void @multi_vector_sub_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_sub_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT: fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
+ ret void
+}
+
+define void @multi_vector_sub_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_sub_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
+; CHECK-NEXT: fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
+ ret void
+}
+
+; FMLA (INDEXED)
+
+define void @multi_vector_add_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm, i32 3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm, i32 3)
+ ret void
+}
+
+define void @multi_vector_add_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm, i32 1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm, i32 1)
+ ret void
+}
+
+; Test to ensure the correct register class is used (first register in the list should be a multiple of 2)
+define void @multi_vector_add_lane_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_add_lane_vg1x2_s_regclass:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z0.d
+; CHECK-NEXT: fmla za.s[w8, 0, vgx2], { z4.s, z5.s }, z2.s[3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
+ <vscale x 4 x float> %zm, i32 3)
+ ret void
+}
+
+define void @multi_vector_add_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_add_lane_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm) {
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm, i32 3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm, i32 3)
+ ret void
+}
+
+define void @multi_vector_add_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_add_lane_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm) {
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm, i32 1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm, i32 1)
+ ret void
+}
+
+; Test to ensure the correct register class is used (first register in the list should be a multiple of 4)
+define void @multi_vector_add_lane_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_add_lane_vg1x4_s_regclass:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: mov z27.d, z0.d
+; CHECK-NEXT: fmla za.s[w8, 0, vgx4], { z24.s - z27.s }, z4.s[3]
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm) {
+ call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2,
+ <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
+ <vscale x 4 x float> %zm, i32 3)
+ ret void
+}
+
+; FMLS (INDEXED)
+
+define void @multi_vector_sub_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg1x2_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm, i32 3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zm, i32 3)
+ ret void
+}
+
+define void @multi_vector_sub_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
+; CHECK-LABEL: multi_vector_sub_lane_vg1x2_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm, i32 1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zm, i32 1)
+ ret void
+}
+
+define void @multi_vector_sub_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+; CHECK-LABEL: multi_vector_sub_lane_vg1x4_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
+; CHECK-NEXT: ret
+ <vscale x 4 x float> %zm) {
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm, i32 3)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice.7,
+ <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
+ <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
+ <vscale x 4 x float> %zm, i32 3)
+ ret void
+}
+
+define void @multi_vector_sub_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+; CHECK-LABEL: multi_vector_sub_lane_vg1x4_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
+; CHECK-NEXT: ret
+ <vscale x 2 x double> %zm) {
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm, i32 1)
+ %slice.7 = add i32 %slice, 7
+ call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice.7,
+ <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
+ <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
+ <vscale x 2 x double> %zm, i32 1)
+ ret void
+}
+
+declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
+ <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
+ <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
+ <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
+ <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
+
+declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
More information about the llvm-commits
mailing list