[llvm] 6892d54 - [Clang][LLVM] Implement single-single vectors MOP4{A/S} (#127797)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 1 05:35:15 PDT 2025
Author: Virginia Cangelosi
Date: 2025-04-01T13:35:09+01:00
New Revision: 6892d5428600113dade7b4ecf6b70bbab3198c90
URL: https://github.com/llvm/llvm-project/commit/6892d5428600113dade7b4ecf6b70bbab3198c90
DIFF: https://github.com/llvm/llvm-project/commit/6892d5428600113dade7b4ecf6b70bbab3198c90.diff
LOG: [Clang][LLVM] Implement single-single vectors MOP4{A/S} (#127797)
Implement all single-single {BF/F/S/U/SU/US}MOP4{A/S} instructions in
clang and llvm following the ACLE specification in
https://github.com/ARM-software/acle/pull/381/files
Added:
clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp
llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
Modified:
clang/include/clang/Basic/arm_sme.td
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
llvm/lib/Target/AArch64/SMEInstrFormats.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 288a8c04c217f..5012874a08790 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -289,6 +289,87 @@ multiclass ZAFPOuterProd<string n_suffix> {
defm SVMOPA : ZAFPOuterProd<"mopa">;
defm SVMOPS : ZAFPOuterProd<"mops">;
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - FMOP4A, FMOP4S, BFMOP4A, BFMOP4S
+
+multiclass MOP4<string mode, string za, string t, string i, list<ImmCheck> checks> {
+ def _1x1 : Inst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{d}]", "vidd", t, MergeNone, i # "_1x1", [IsInOutZA, IsStreaming], checks>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4" in {
+ defm SVFMOP4A_HtoS : MOP4<"a", "_za32", "hb", "aarch64_sme_mop4a_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVFMOP4S_HtoS : MOP4<"s", "_za32", "hb", "aarch64_sme_mop4s_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVFMOP4A_S : MOP4<"a", "_za32", "f", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVFMOP4S_S : MOP4<"s", "_za32", "f", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_3>]>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4,sme-f64f64" in {
+ defm SVFMOP4A_D : MOP4<"a", "_za64", "d", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVFMOP4S_D : MOP4<"s", "_za64", "d", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_7>]>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4,sme-f16f16" in {
+ defm SVFMOP4A_H : MOP4<"a", "_za16", "h", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_1>]>;
+ defm SVFMOP4S_H : MOP4<"s", "_za16", "h", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_1>]>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4,sme-b16b16" in {
+ defm SVBMOP4A_H : MOP4<"a", "_za16", "b", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_1>]>;
+ defm SVBMOP4S_H : MOP4<"s", "_za16", "b", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_1>]>;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - SMOP4A, SMOP4S, UMOP4A, UMOP4S
+
+let SMETargetGuard = "sme2,sme-mop4" in {
+ defm SVSMOP4A_H : MOP4<"a", "_za32", "cs", "aarch64_sme_smop4a_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVSMOP4S_H : MOP4<"s", "_za32", "cs", "aarch64_sme_smop4s_wide", [ImmCheck<0, ImmCheck0_3>]>;
+
+ defm SVUMOP4A_H : MOP4<"a", "_za32", "UcUs", "aarch64_sme_umop4a_wide", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVUMOP4S_H : MOP4<"s", "_za32", "UcUs", "aarch64_sme_umop4s_wide", [ImmCheck<0, ImmCheck0_3>]>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4,sme-i16i64" in {
+ defm SVSMOP4A_HtoD : MOP4<"a", "_za64", "s", "aarch64_sme_smop4a_za64_wide", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVSMOP4S_HtoD : MOP4<"s", "_za64", "s", "aarch64_sme_smop4s_za64_wide", [ImmCheck<0, ImmCheck0_7>]>;
+
+ defm SVUMOP4A_HtoD : MOP4<"a", "_za64", "Us", "aarch64_sme_umop4a_za64_wide", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVUMOP4S_HtoD : MOP4<"s", "_za64", "Us", "aarch64_sme_umop4s_za64_wide", [ImmCheck<0, ImmCheck0_7>]>;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - SUMOP4A, SUMOP4S, USMOP4A, USMOP4S
+
+multiclass SUMOP4<string mode, string za, string t, string i, list<ImmCheck> checks> {
+ def _1x1 : SInst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{3}]",
+ "vidu", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_1x1",
+ [IsStreaming, IsInOutZA],
+ checks>;
+}
+
+multiclass USMOP4<string mode, string za, string t, string i, list<ImmCheck> checks> {
+ def _1x1 : SInst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{3}]",
+ "vidx", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_1x1",
+ [IsStreaming, IsInOutZA],
+ checks>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4" in {
+ defm SVSUMOP4A_S : SUMOP4<"a", "_za32", "c", "", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVSUMOP4S_S : SUMOP4<"s", "_za32", "c", "", [ImmCheck<0, ImmCheck0_3>]>;
+
+ defm SVUSMOP4A_S : USMOP4<"a", "_za32", "Uc", "", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVUSMOP4S_S : USMOP4<"s", "_za32", "Uc", "", [ImmCheck<0, ImmCheck0_3>]>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4,sme-i16i64" in {
+ defm SVSUMOP4A_D : SUMOP4<"a", "_za64", "s", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVSUMOP4S_D : SUMOP4<"s", "_za64", "s", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
+
+ defm SVUSMOP4A_D : USMOP4<"a", "_za64", "Us", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
+ defm SVUSMOP4S_D : USMOP4<"s", "_za64", "Us", "_za64", [ImmCheck<0, ImmCheck0_7>]>;
+}
+
////////////////////////////////////////////////////////////////////////////////
// SME2 - ADD, SUB
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
new file mode 100644
index 0000000000000..94a839d053479
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c
@@ -0,0 +1,465 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS /* must match the -DSVE_OVERLOADED_FORMS flag in the RUN lines above */
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3, A4_UNUSED) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_s8u10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_s8_s8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_s8u10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_s8_s8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_u8_u8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_u8_u8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_s8_u8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_s8_u8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_u8_s8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_u8_s8)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_s16_s16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_s16_s16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_u16_u16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_u16_u16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_f16_f16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_f16_f16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za32_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_bf16_bf16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za32_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_bf16_bf16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za64,_s16_s16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_s16u11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za64,_s16_s16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za64,_u16_u16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_u16u12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za64,_u16_u16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za64,_s16_u16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za64,_s16_u16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za64,_u16_s16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za64,_u16_s16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za16_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za16,_f16_f16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za16_f16_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za16_f16_f16u13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za16,_f16_f16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f32_f32u13__SVFloat32_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_f32_f32)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za32_f32_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f32_f32u13__SVFloat32_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za32_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za32,_f32_f32)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_f64_f64u13__SVFloat64_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za64,_f64_f64)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za64_f64_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_f64_f64u13__SVFloat64_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za64,_f64_f64)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4a_1x1_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za16_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a,_1x1,_za16,_bf16_bf16)(1, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmop4s_1x1_za16_bf16_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za16_bf16_bf16u14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4s_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4s,_1x1,_za16,_bf16_bf16)(1, zn, zm);
+}
diff --git a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp
new file mode 100644
index 0000000000000..556cb1742dbbd
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp
@@ -0,0 +1,84 @@
+// NOTE: This is a Sema test; the expected-error assertions below are hand-written and checked with -verify.
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sme -target-feature +sme2p2 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+void tests_mop4_imm_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
+
+void tests_mop4_imm_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
+
+void tests_mop4_imm_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4a_1x1_za32_u8_s8(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_u8_s8(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
+
+void tests_mop4_imm_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+
+ svmop4a_1x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmop4s_1x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ return;
+}
+
+void tests_mop4_imm_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+
+ svmop4a_1x1_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmop4s_1x1_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ return;
+}
+
+void tests_mop4_imm_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmop4s_1x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmop4a_1x1_za64_u16_s16(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmop4s_1x1_za64_u16_s16(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ return;
+}
+
+void tests_mop4_imm_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+
+ svmop4a_1x1_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svmop4s_1x1_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ return;
+}
+
+void tests_mop4_imm_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+
+ svmop4a_1x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svmop4s_1x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ return;
+
+}
+
+void tests_mop4_imm_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svmop4s_1x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
+
+void tests_mop4_imm_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmop4s_1x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ return;
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6dfc3c8f2a393..fe8769154b1da 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3064,6 +3064,28 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic;
def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic;
+ class SME_OuterProduct_QuarterTile_Single_Single
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty,
+ LLVMMatchType<0>], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+
+ // 2-way and 4-way multi-vector signed/unsigned Quarter Tile Quarter Product A/S
+ foreach mode = ["s", "a"] in {
+ foreach za = ["", "_za64"] in {
+ foreach ty = ["s", "u", "su", "us"] in {
+ def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x1" : SME_OuterProduct_QuarterTile_Single_Single;
+ }
+ }
+ }
+
+ // 2-way and 4-way multi-vector floating point Quarter Tile Quarter Product A/S
+ foreach mode = ["s", "a"] in {
+ foreach wide = ["", "_wide"] in {
+ def int_aarch64_sme_mop4 # mode # wide # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single;
+ }
+ }
+
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
@@ -3835,7 +3857,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_luti4_lane_zt_x2
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
-
+
//
// Lookup table expand four registers
//
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index d2aa86f388db2..f992f73171e0e 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -148,30 +148,30 @@ defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme
}
let Predicates = [HasSME_MOP4] in {
- defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a">;
- defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s">;
- defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a">;
- defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s">;
- defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a">;
- defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s">;
- defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a">;
- defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s">;
-
- defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a">;
- defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s">;
- defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a">;
- defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s">;
+ defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">;
+ defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">;
+ defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_wide">;
+ defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_wide">;
+ defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_wide">;
+ defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s", "int_aarch64_sme_usmop4s_wide">;
+ defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">;
+ defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">;
+
+ defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">;
+ defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">;
+ defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">;
+ defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">;
}
let Predicates = [HasSME_MOP4, HasSMEI16I64] in {
- defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a">;
- defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s">;
- defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a">;
- defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s">;
- defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a">;
- defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s">;
- defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a">;
- defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s">;
+ defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_za64_wide">;
+ defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_za64_wide">;
+ defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_za64_wide">;
+ defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_za64_wide">;
+ defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_za64_wide">;
+ defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_za64_wide">;
+ defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_za64_wide">;
+ defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s", "int_aarch64_sme_usmop4s_za64_wide">;
}
let Predicates = [HasSME_TMOP] in {
@@ -1054,14 +1054,14 @@ let Predicates = [HasSME2, HasSVEBFSCALE] in {
}
let Predicates = [HasSME_MOP4] in {
- defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a">;
- defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s">;
+ defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a", "int_aarch64_sme_mop4a_wide">;
+ defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s", "int_aarch64_sme_mop4s_wide">;
- defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a", "int_aarch64_sme_mop4a_wide">;
+ defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s", "int_aarch64_sme_mop4s_wide">;
- defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;
+ defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;
}
let Predicates = [HasSME_TMOP] in {
@@ -1084,7 +1084,7 @@ let Predicates = [HasSME_TMOP, HasSMEB16B16] in {
let Predicates = [HasSME_TMOP, HasSMEF8F32], Uses = [FPMR, FPCR] in {
def FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, "ftmopa">;
-}
+}
let Predicates = [HasSME_TMOP, HasSMEF8F16], Uses = [FPMR, FPCR] in {
def FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, "ftmopa">;
@@ -1099,8 +1099,8 @@ let Predicates = [HasSME_TMOP, HasSMEF16F16] in {
}
let Predicates = [HasSME_MOP4, HasSMEF16F16] in {
- defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;
+ defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;
}
let Predicates = [HasSME2, HasSVEBFSCALE] in {
@@ -1115,11 +1115,11 @@ let Predicates = [HasSME_MOP4, HasSMEF8F32] in {
}
let Predicates = [HasSME_MOP4, HasSMEB16B16] in {
- defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a">;
- defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s">;
+ defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a", "int_aarch64_sme_mop4a">;
+ defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s", "int_aarch64_sme_mop4s">;
}
let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
- defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a">;
- defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s">;
+ defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;
+ defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;
}
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 4f6a413ba5e5c..54c63ead059ae 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,6 +104,15 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
let usesCustomInserter = 1;
}
+class sme2_quarter_tile_outer_product_pseudo<RegisterOperand zn_ty, RegisterOperand zm_ty, SMEMatrixTypeEnum za_flag>
+ : Pseudo<(outs), (ins i32imm:$tile,
+ zn_ty:$zn, zm_ty:$zm), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
: SMEPseudo2Instr<name, 0>,
@@ -257,6 +266,9 @@ class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType ou
: Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)))),
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset)>;
+class SME2_ZA_Tile_Vec_Single_Single_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType vt>
+ : Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm),
+ (!cast<Instruction>(name # _PSEUDO) $tile, $Zn, $Zm)>;
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
@@ -600,9 +612,14 @@ class sme_quarter_outer_product_i16_i32<bit u0, bit N, bit M, bit subtr, Registe
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic>{
+multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{
def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr,
- ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>;
+ ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_BToS, 1>;
+
+ def NAME # _MZZ_BToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_BToS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_BToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv16i8>;
+
def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>;
def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr,
@@ -611,9 +628,14 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin
ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>;
}
-multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic>{
+multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic, string op>{
def _MZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b0, subtr,
- ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>;
+ ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HToS, 1>;
+
+ def NAME # _MZZ_HToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HToS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_HToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8i16>;
+
def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b0, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b1, subtr,
@@ -622,9 +644,14 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne
ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>;
}
-multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic>{
+multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{
def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr,
- ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>;
+ ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 1>;
+
+ def NAME # _MZZ_HtoD_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_HtoD, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv8i16>;
+
def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr,
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>;
def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr,
@@ -5470,9 +5497,13 @@ class sme2_bf16_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_bfmop4as_widening<bit S, string mnemonic> {
+multiclass sme2_bfmop4as_widening<bit S, string mnemonic, string op> {
// Single vectors
- def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_S, 1>;
+
+ def NAME # _MZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8bf16>;
// Multiple and single vectors
def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5617,9 +5648,13 @@ class sme2_fp16_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, string op> {
// Single vectors
- def _MZZ_H : sme2_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_H : sme2_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_H, 1>;
+
+ def NAME # _MZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8f16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5689,9 +5724,13 @@ class sme2_bf16_fp16_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic> {
+multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, string op> {
// Single vectors
- def _MZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_H, 1>;
+
+ def NAME # _MZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_H, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8bf16>;
// Multiple and single vectors
def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
@@ -5726,9 +5765,13 @@ class sme2_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic, string op> {
// Single vectors
- def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>;
+ def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_S, 1>;
+
+ def NAME # _MZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR32Mul2_Lo, ZPR32Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_S, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv4f32>;
// Multiple and single vectors
def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>;
@@ -5763,9 +5806,13 @@ class sme2_fp64_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic,
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, string op> {
// Single vectors
- def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>;
+ def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_D, 1>;
+
+ def NAME # _MZZ_D_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR64Mul2_Lo, ZPR64Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _MZZ_D, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_D, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv2f64>;
// Multiple and single vectors
def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>;
@@ -5800,9 +5847,13 @@ class sme2_fp16_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnem
let Constraints = "$ZAda = $_ZAda";
}
-multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic> {
+multiclass sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic, string op> {
// Single vectors
- def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+ def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_HtoS, 1>;
+
+ def NAME # _MZZ_HtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR16Mul2_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_HtoS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_HtoS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8f16>;
// Multiple and single vectors
def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
new file mode 100644
index 0000000000000..ec899fab7cf21
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll
@@ -0,0 +1,419 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+; Widening
+define void @mop4a_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4a za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4s za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4a za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4s za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4a za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u8_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4s za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za32_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4a za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4s_za32_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4s za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+define void @mop4a_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4s_za32_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4a za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+; Widening MOP4 tests, single-single (1x1) vector form: 16-bit sources are
+; multiplied and accumulated (mop4a) or subtracted (mop4s) into a wider ZA
+; tile.  In every CHECK block the compiler first copies %zm (z1) into z24;
+; presumably the second MOP4 source operand is restricted to the upper Z
+; register range — confirm against the SME2 instruction encoding.
+
+; BF16 sources into a 32-bit tile, subtracting form: bfmop4s za0.s.
+define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za0.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+; Signed i16 sources into a 64-bit tile: smop4a za0.d.
+define void @mop4a_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4a za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Signed i16, subtracting form: smop4s za0.d.
+define void @mop4s_za64_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Unsigned i16 sources into a 64-bit tile: umop4a za0.d.
+define void @mop4a_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4a za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Unsigned i16, subtracting form: umop4s za0.d.
+define void @mop4s_za64_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: umop4s za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Mixed signedness, signed %zn x unsigned %zm: sumop4a za0.d.
+define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4a za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Mixed signedness, subtracting form: sumop4s za0.d.
+define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16_u16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: sumop4s za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Mixed signedness, unsigned %zn x signed %zm: usmop4a za0.d.
+define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4a za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Mixed signedness, subtracting form: usmop4s za0.d.
+define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_u16_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: usmop4s za0.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Non-widening
+; Source element type matches the ZA tile element size (f16->za16.h,
+; f32->za32.s, f64->za64.d, bf16->za16.h); these lower through the plain
+; mop4a/mop4s intrinsics with no ".wide" suffix.
+define void @mop4a_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+; FP16, subtracting form: fmop4s za0.h.
+define void @mop4s_za16_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za0.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 0, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+; FP32 accumulate: fmop4a za0.s.
+define void @mop4a_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.s, z0.s, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ ret void
+}
+
+; FP32, subtracting form: fmop4s za0.s.
+define void @mop4s_za32_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za0.s, z0.s, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 0, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ ret void
+}
+
+; FP64 accumulate: fmop4a za0.d.
+define void @mop4a_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4a_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.d, z0.d, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ ret void
+}
+
+; FP64, subtracting form: fmop4s za0.d.
+define void @mop4s_za64_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za0.d, z0.d, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 0, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ ret void
+}
+
+; BF16 accumulate into a 16-bit tile: bfmop4a za0.h.
+define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4a za0.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+; BF16, subtracting form: bfmop4s za0.h.
+define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za0.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+; Tile limits
+; Each test passes the maximum legal immediate tile index for its ZA element
+; size, as seen in the emitted tile operand: 3 for za32 (za0-za3), 7 for
+; za64 (za0-za7), 1 for za16 (za0-za1).
+define void @mop4s_za32_s8_limit(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s8_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za3.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+; Widening i16 -> za32, highest 32-bit tile (za3.s).
+define void @mop4s_za32_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_s16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za3.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Widening f16 -> za32, highest 32-bit tile (za3.s).
+define void @mop4s_za32_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za3.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+; Widening bf16 -> za32, highest 32-bit tile (za3.s).
+define void @mop4s_za32_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_bf16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za3.s, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+; Widening i16 -> za64, highest 64-bit tile (za7.d).
+define void @mop4s_za64_s16_limit(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_s16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: smop4s za7.d, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 7, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+ ret void
+}
+
+; Non-widening f64, highest 64-bit tile (za7.d).
+define void @mop4s_za64_f64_limit(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) #0 {
+; CHECK-LABEL: mop4s_za64_f64_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za7.d, z0.d, z24.d
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 7, <vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
+ ret void
+}
+
+; Non-widening f32, highest 32-bit tile (za3.s).
+define void @mop4s_za32_f32_limit(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) #0 {
+; CHECK-LABEL: mop4s_za32_f32_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za3.s, z0.s, z24.s
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 3, <vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
+ ret void
+}
+
+; Non-widening f16, highest 16-bit tile (za1.h).
+define void @mop4s_za16_f16_limit(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_f16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+ ret void
+}
+
+; Non-widening bf16, highest 16-bit tile (za1.h).
+define void @mop4s_za16_bf16_limit(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) #0 {
+; CHECK-LABEL: mop4s_za16_bf16_limit:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: bfmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+ ret void
+}
+
+; Shared attribute group: nounwind plus every target feature the tests above
+; rely on (e.g. +sme-i16i64 for the za64 integer variants, +sme-f64f64 for
+; the f64 variants, +sme-f16f16/+sme-b16b16 for the non-widening half/bfloat
+; variants, and +sme-mop4 for the MOP4 instructions themselves).
+attributes #0 = {nounwind "target-features" = "+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" }
More information about the llvm-commits mailing list