[llvm] aa39882 - [AArch64][SME2] Add SME2 outer product intrinsics
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 31 03:45:07 PST 2023
Author: Kerry McLaughlin
Date: 2023-01-31T11:44:48Z
New Revision: aa39882447bd4400f0da11521c5b4dcbab5c2dde
URL: https://github.com/llvm/llvm-project/commit/aa39882447bd4400f0da11521c5b4dcbab5c2dde
DIFF: https://github.com/llvm/llvm-project/commit/aa39882447bd4400f0da11521c5b4dcbab5c2dde.diff
LOG: [AArch64][SME2] Add SME2 outer product intrinsics
Adds intrinsics for the following SME2 outer product instructions:
- smopa / smops
- umopa / umops
- bmopa / bmops
Tests for existing SME mopa/mops intrinsics have also been updated
to use the maximum allowed ZA tile number.
NOTE: These intrinsics are still in development and are subject to change.
Reviewed By: david-arm
Differential Revision: https://reviews.llvm.org/D141849
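For reference, a minimal LLVM IR sketch of how one of the new intrinsics is
invoked, mirroring the added sme2-intrinsics-mop.ll test below (the function
name and operand values are illustrative placeholders; only the intrinsic
name and signature come from this patch):

  declare void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)

  define void @smopa_example(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
    ; Accumulates the widened signed outer product of %zn and %zm into a
    ; 32-bit ZA tile, governed by the two predicates; the leading i32
    ; immediate selects the tile (za0.s here).
    call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
    ret void
  }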
Added:
llvm/test/CodeGen/AArch64/sme2-intrinsics-mop.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
llvm/lib/Target/AArch64/SMEInstrFormats.td
llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll
llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index da7f575a4786e..45a41c2c4ecf7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2976,6 +2976,18 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_fmla_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
def int_aarch64_sme_fmls_lane_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+ //
+ // Outer product and accumulate/subtract intrinsics
+ //
+
+ def int_aarch64_sme_smopa_za32 : SME_OuterProduct_Intrinsic;
+ def int_aarch64_sme_umopa_za32 : SME_OuterProduct_Intrinsic;
+ def int_aarch64_sme_smops_za32 : SME_OuterProduct_Intrinsic;
+ def int_aarch64_sme_umops_za32 : SME_OuterProduct_Intrinsic;
+
+ def int_aarch64_sme_bmopa_za32 : SME_OuterProduct_Intrinsic;
+ def int_aarch64_sme_bmops_za32 : SME_OuterProduct_Intrinsic;
+
// Multi-vector saturating rounding shift right intrinsics
def int_aarch64_sve_sqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 0f7d09a2a4ac2..939245bc0efe6 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -563,14 +563,14 @@ defm UMLSLL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg24_single<"umlsll", 0b01110, Ma
defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r>;
defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r>;
-defm BMOPA_MPPZZ_S : sme2_bfp_mopx_tile<"bmopa", 0b100>;
-defm BMOPS_MPPZZ_S : sme2_bfp_mopx_tile<"bmops", 0b101>;
+defm BMOPA_MPPZZ_S : sme2_int_bmopx_tile<"bmopa", 0b100, int_aarch64_sme_bmopa_za32>;
+defm BMOPS_MPPZZ_S : sme2_int_bmopx_tile<"bmops", 0b101, int_aarch64_sme_bmops_za32>;
-defm SMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"smopa", 0b000>;
-defm SMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"smops", 0b001>;
+defm SMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"smopa", 0b000, int_aarch64_sme_smopa_za32>;
+defm SMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"smops", 0b001, int_aarch64_sme_smops_za32>;
-defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100>;
-defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101>;
+defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100, int_aarch64_sme_umopa_za32>;
+defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops_za32>;
def ZERO_T : sme2_zero_zt<"zero", 0b0001>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index bf8f241427345..43562b8ebbd34 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -182,6 +182,14 @@ class SME2_Tile_VG4_Multi_Pat<string name, SDPatternOperator intrinsic, Operand
: Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>;
+//===----------------------------------------------------------------------===//
+// SME pattern match helpers.
+//===----------------------------------------------------------------------===//
+
+class SME_ZA_Tile_TwoPred_TwoVec_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType pg_ty, ValueType vt>
+ : Pat<(intrinsic imm_ty:$tile, (pg_ty PPR3bAny:$Pn), (pg_ty PPR3bAny:$Pm), vt:$Zn, vt:$Zm),
+ (!cast<Instruction>(name # _PSEUDO) $tile, $Pn, $Pm, $Zn, $Zm)>;
+
//===----------------------------------------------------------------------===//
// SME Outer Products
//===----------------------------------------------------------------------===//
@@ -220,9 +228,7 @@ multiclass sme_outer_product_fp32<bit S, string mnemonic, SDPatternOperator op>
def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR32, SMEMatrixTileS>, SMEPseudo2Instr<NAME, 0>;
- def : Pat<(op timm32_0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm),
- (nxv4f32 ZPR32:$zn), (nxv4f32 ZPR32:$zm)),
- (!cast<Instruction>(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>;
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_3, nxv4i1, nxv4f32>;
}
multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op> {
@@ -233,9 +239,7 @@ multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op>
def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR64, SMEMatrixTileD>, SMEPseudo2Instr<NAME, 0>;
- def : Pat<(op timm32_0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm),
- (nxv2f64 ZPR64:$zn), (nxv2f64 ZPR64:$zm)),
- (!cast<Instruction>(NAME # _PSEUDO) timm32_0_7:$tile, $pn, $pm, $zn, $zm)>;
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_7, nxv2i1, nxv2f64>;
}
multiclass sme2p1_fmop_tile_fp16<string mnemonic, bit bf, bit s>{
@@ -284,9 +288,7 @@ multiclass sme_int_outer_product_i32<bits<3> opc, string mnemonic,
def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR8, SMEMatrixTileS>, SMEPseudo2Instr<NAME, 0>;
- def : Pat<(op timm32_0_3:$tile, (nxv16i1 PPR3bAny:$pn), (nxv16i1 PPR3bAny:$pm),
- (nxv16i8 ZPR8:$zn), (nxv16i8 ZPR8:$zm)),
- (!cast<Instruction>(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>;
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_3, nxv16i1, nxv16i8>;
}
multiclass sme_int_outer_product_i64<bits<3> opc, string mnemonic,
@@ -299,9 +301,7 @@ multiclass sme_int_outer_product_i64<bits<3> opc, string mnemonic,
def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16, SMEMatrixTileD>, SMEPseudo2Instr<NAME, 0>;
- def : Pat<(op timm32_0_7:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
- (nxv8i16 ZPR16:$zn), (nxv8i16 ZPR16:$zm)),
- (!cast<Instruction>(NAME # _PSEUDO) timm32_0_7:$tile, $pn, $pm, $zn, $zm)>;
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_7, nxv8i1, nxv8i16>;
}
class sme_outer_product_widening_inst<bits<3> opc, ZPRRegOp zpr_ty, string mnemonic>
@@ -336,9 +336,7 @@ multiclass sme_bf16_outer_product<bits<3> opc, string mnemonic, SDPatternOperato
def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16, SMEMatrixTileS>, SMEPseudo2Instr<NAME, 0>;
- def : Pat<(op timm32_0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
- (nxv8bf16 ZPR16:$zn), (nxv8bf16 ZPR16:$zm)),
- (!cast<Instruction>(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>;
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_3, nxv8i1, nxv8bf16>;
}
multiclass sme_f16_outer_product<bits<3> opc, string mnemonic, SDPatternOperator op> {
@@ -346,9 +344,7 @@ multiclass sme_f16_outer_product<bits<3> opc, string mnemonic, SDPatternOperator
def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16, SMEMatrixTileS>, SMEPseudo2Instr<NAME, 0>;
- def : Pat<(op timm32_0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm),
- (nxv8f16 ZPR16:$zn), (nxv8f16 ZPR16:$zm)),
- (!cast<Instruction>(NAME # _PSEUDO) timm32_0_3:$tile, $pn, $pm, $zn, $zm)>;
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, op, timm32_0_3, nxv8i1, nxv8f16>;
}
//===----------------------------------------------------------------------===//
@@ -2839,16 +2835,24 @@ multiclass sme2_mla_ll_array_vg4_multi<string mnemonic, bits<4> op,
//===----------------------------------------------------------------------===//
// SME2 Outer Product and Accumulate
-multiclass sme2_int_mopx_tile<string mnemonic, bits<3> op> {
- def NAME : sme_int_outer_product_inst<op, 0b0, 0b1, TileOp32, ZPR16, mnemonic> {
+multiclass sme2_int_mopx_tile<string mnemonic, bits<3> op, SDPatternOperator intrinsic> {
+ def NAME : sme_int_outer_product_inst<op, 0b0, 0b1, TileOp32, ZPR16, mnemonic>, SMEPseudo2Instr<NAME, 1> {
bits<2> ZAda;
let Inst{1-0} = ZAda;
let Inst{2} = 0b0;
}
+
+ def _PSEUDO : sme_outer_product_pseudo<ZPR16, SMEMatrixTileS>, SMEPseudo2Instr<NAME, 0>;
+
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, intrinsic, timm32_0_3, nxv8i1, nxv8i16>;
}
-multiclass sme2_bfp_mopx_tile<string mnemonic, bits<3> op> {
- def NAME : sme_outer_product_widening_inst<op, ZPR32, mnemonic>;
+multiclass sme2_int_bmopx_tile<string mnemonic, bits<3> op, SDPatternOperator intrinsic> {
+ def NAME : sme_outer_product_widening_inst<op, ZPR32, mnemonic>, SMEPseudo2Instr<NAME, 1>;
+
+ def _PSEUDO : sme_outer_product_pseudo<ZPR32, SMEMatrixTileS>, SMEPseudo2Instr<NAME, 0>;
+
+ def : SME_ZA_Tile_TwoPred_TwoVec_Pat<NAME, intrinsic, timm32_0_3, nxv4i1, nxv4i32>;
}
//===----------------------------------------------------------------------===///
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll
index 6f722189bcc34..364ecaaa2a3fb 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mopa.ll
@@ -4,36 +4,36 @@
define void @bfmopa(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: bfmopa:
; CHECK: // %bb.0:
-; CHECK-NEXT: bfmopa za0.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: bfmopa za3.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
define void @fmopa(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: fmopa:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmopa za1.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: fmopa za3.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
define void @smopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: smopa_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: smopa za2.s, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT: smopa za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @smopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: smopa_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: smopa za0.d, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: smopa za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -49,54 +49,54 @@ define void @umopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 1
define void @umopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: umopa_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: umopa za1.d, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: umopa za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @fmopa_s(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm) {
; CHECK-LABEL: fmopa_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmopa za0.s, p0/m, p1/m, z0.s, z1.s
+; CHECK-NEXT: fmopa za3.s, p0/m, p1/m, z0.s, z1.s
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
define void @fmopa_d(<vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #1 {
; CHECK-LABEL: fmopa_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmopa za2.d, p0/m, p1/m, z0.d, z1.d
+; CHECK-NEXT: fmopa za7.d, p0/m, p1/m, z0.d, z1.d
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mopa.nxv2f64(i32 2, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
define void @sumopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: sumopa_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: sumopa za1.s, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT: sumopa za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 1, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @sumopa_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: sumopa_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: sumopa za3.d, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: sumopa za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @usmopa_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: usmopa_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: usmopa za2.s, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT: usmopa za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll
index 85b7ea678f3db..aec01fa5b9605 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mops.ll
@@ -4,36 +4,36 @@
define void @bfmops(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: bfmops:
; CHECK: // %bb.0:
-; CHECK-NEXT: bfmops za0.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: bfmops za3.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
ret void
}
define void @fmops(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: fmops:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmops za1.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: fmops za3.s, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
ret void
}
define void @smops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: smops_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: smops za2.s, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT: smops za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @smops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: smops_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: smops za0.d, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: smops za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 0, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
@@ -49,54 +49,54 @@ define void @umops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 1
define void @umops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: umops_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: umops za1.d, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: umops za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 1, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @fmops_s(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm) {
; CHECK-LABEL: fmops_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmops za0.s, p0/m, p1/m, z0.s, z1.s
+; CHECK-NEXT: fmops za3.s, p0/m, p1/m, z0.s, z1.s
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mops.nxv4f32(i32 0, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ call void @llvm.aarch64.sme.mops.nxv4f32(i32 3, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
ret void
}
define void @fmops_d(<vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #1 {
; CHECK-LABEL: fmops_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmops za2.d, p0/m, p1/m, z0.d, z1.d
+; CHECK-NEXT: fmops za7.d, p0/m, p1/m, z0.d, z1.d
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.mops.nxv2f64(i32 2, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, <vscale x 2 x i1> %pn, <vscale x 2 x i1> %pm, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
ret void
}
define void @sumops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: sumops_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: sumops za1.s, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT: sumops za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 1, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
define void @sumops_d(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
; CHECK-LABEL: sumops_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: sumops za3.d, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: sumops za7.d, p0/m, p1/m, z0.h, z1.h
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 7, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
ret void
}
define void @usmops_s(<vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: usmops_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: usmops za2.s, p0/m, p1/m, z0.b, z1.b
+; CHECK-NEXT: usmops za3.s, p0/m, p1/m, z0.b, z1.b
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 2, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 3, <vscale x 16 x i1> %pn, <vscale x 16 x i1> %pm, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop.ll
new file mode 100644
index 0000000000000..8b17dad8d01c1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+; MOPA/MOPS
+
+define void @outer_sum_accumulate_s16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: outer_sum_accumulate_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smopa za3.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @outer_sum_accumulate_u16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: outer_sum_accumulate_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umopa za3.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @outer_sum_subtract_s16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: outer_sum_subtract_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smops za3.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smops.za32.nxv8i16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @outer_sum_subtract_u16(<vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
+; CHECK-LABEL: outer_sum_subtract_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umops za3.s, p0/m, p1/m, z0.h, z1.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umops.za32.nxv8i16(i32 3, <vscale x 8 x i1> %pn, <vscale x 8 x i1> %pm, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+;
+; BMOPA/BMOPS
+;
+
+define void @bitwise_outer_sum_accumulate_u32(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) {
+; CHECK-LABEL: bitwise_outer_sum_accumulate_u32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bmopa za3.s, p0/m, p1/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32 3, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
+ ret void
+}
+
+define void @bitwise_outer_sum_subtract_u32(<vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) {
+; CHECK-LABEL: bitwise_outer_sum_subtract_u32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bmops za3.s, p0/m, p1/m, z0.s, z1.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32 3, <vscale x 4 x i1> %pn, <vscale x 4 x i1> %pm, <vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
+ ret void
+}
+
+declare void @llvm.aarch64.sme.smopa.za32.nxv8i16(i32, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umopa.za32.nxv8i16(i32, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.smops.za32.nxv8i16(i32, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare void @llvm.aarch64.sme.umops.za32.nxv8i16(i32, <vscale x 8 x i1>, <vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare void @llvm.aarch64.sme.bmopa.za32.nxv4i32(i32, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sme.bmops.za32.nxv4i32(i32, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+